You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
755 lines
24 KiB
755 lines
24 KiB
// Copyright (c) AlphaSierraPapa for the SharpDevelop Team |
|
// |
|
// Permission is hereby granted, free of charge, to any person obtaining a copy of this |
|
// software and associated documentation files (the "Software"), to deal in the Software |
|
// without restriction, including without limitation the rights to use, copy, modify, merge, |
|
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons |
|
// to whom the Software is furnished to do so, subject to the following conditions: |
|
// |
|
// The above copyright notice and this permission notice shall be included in all copies or |
|
// substantial portions of the Software. |
|
// |
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
|
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR |
|
// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE |
|
// FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
|
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|
// DEALINGS IN THE SOFTWARE. |
|
|
|
using System; |
|
using System.Collections.Generic; |
|
using System.Globalization; |
|
using System.Linq; |
|
using System.Text; |
|
using ICSharpCode.NRefactory.Editor; |
|
|
|
namespace ICSharpCode.NRefactory.Xml |
|
{ |
|
class TagReader : TokenReader |
|
{ |
|
/// <summary>
/// Parser this reader works for; its options are consulted when an
/// unknown entity reference is encountered.
/// </summary>
readonly TagSoupParser tagSoupParser;

/// <summary>
/// Creates a reader that tokenizes <paramref name="input"/> into internal objects.
/// </summary>
/// <param name="tagSoupParser">The parser that owns this reader.</param>
/// <param name="input">Text to read; handed on to the base reader.</param>
public TagReader(TagSoupParser tagSoupParser, ITextSource input)
	: base(input)
{
	this.tagSoupParser = tagSoupParser;
}
|
|
|
/// <summary>
/// Reads objects until the input is exhausted and returns them in reading order.
/// The internal object buffer is emptied before returning.
/// </summary>
public InternalObject[] ReadAllObjects()
{
	while (HasMoreData())
		ReadObject();

	InternalObject[] result = objects.ToArray();
	objects.Clear();
	return result;
}
|
|
|
/// <summary>
/// Reads one or more objects: a tag when the input is positioned at '&lt;',
/// character data otherwise.
/// </summary>
void ReadObject()
{
	if (!TryPeek('<')) {
		ReadText(TextType.CharacterData);
		return;
	}
	ReadTag();
}
|
|
|
#region BeginInternalObject / EndInternalObject |
|
/// <summary>
/// Finished objects that have not yet been handed to the caller or moved
/// into a tag's nested objects.
/// </summary>
List<InternalObject> objects = new List<InternalObject>();

/// <summary>
/// Origin against which relative positions (object starts, lengths and
/// syntax error ranges) are currently measured.
/// </summary>
int internalObjectStartPosition;

/// <summary>
/// Current read position measured from <see cref="internalObjectStartPosition"/>.
/// </summary>
int CurrentRelativeLocation {
	get {
		int origin = internalObjectStartPosition;
		return CurrentLocation - origin;
	}
}
|
|
|
/// <summary>
/// Bookkeeping for an object that is currently being read: the object itself
/// and the position origin that must be restored when the object is finished.
/// </summary>
struct InternalObjectFrame
{
	/// <summary>The object being read.</summary>
	public readonly InternalObject InternalObject;

	/// <summary>The position origin that was active before this object was started.</summary>
	public readonly int ParentStartPosition;

	public InternalObjectFrame(InternalObject internalObject, int parentStartPosition)
	{
		this.ParentStartPosition = parentStartPosition;
		this.InternalObject = internalObject;
	}
}
|
|
|
/// <summary>
/// Starts reading <paramref name="internalObject"/> at the current location.
/// </summary>
/// <param name="internalObject">The object that is about to be read.</param>
/// <returns>The frame that must be passed to EndInternalObject when the object is complete.</returns>
InternalObjectFrame BeginInternalObject(InternalObject internalObject)
{
	return BeginInternalObject(internalObject, this.CurrentLocation);
}

/// <summary>
/// Starts reading <paramref name="internalObject"/>, whose first character is at
/// <paramref name="beginLocation"/>. The location may lie before the current
/// read position when the object is created after its text was already consumed.
/// </summary>
/// <param name="internalObject">The object that is about to be read.</param>
/// <param name="beginLocation">Absolute location of the object's first character.</param>
/// <returns>The frame that must be passed to EndInternalObject when the object is complete.</returns>
InternalObjectFrame BeginInternalObject(InternalObject internalObject, int beginLocation)
{
	// The start is stored relative to the origin of the enclosing object.
	internalObject.StartRelativeToParent = beginLocation - internalObjectStartPosition;

	// Remember the enclosing origin so that it can be restored afterwards.
	var frame = new InternalObjectFrame(internalObject, internalObjectStartPosition);

	// From now on positions are relative to this object's own start.
	// Using beginLocation (rather than the current read position) keeps the
	// length computed at the end equal to the number of characters the object
	// really covers when it is started retroactively.
	internalObjectStartPosition = beginLocation;
	return frame;
}
|
|
|
/// <summary>
/// Completes the object described by <paramref name="frame"/>: records its length
/// and the syntax errors collected so far, optionally stores it, and restores the
/// position origin of the enclosing object.
/// </summary>
/// <param name="frame">The frame returned when the object was started.</param>
/// <param name="storeNewObject">When false the object is finished but not added to the object list.</param>
void EndInternalObject(InternalObjectFrame frame, bool storeNewObject = true)
{
	InternalObject finished = frame.InternalObject;

	finished.Length = this.CurrentRelativeLocation;
	finished.SyntaxErrors = GetSyntaxErrors();

	if (storeNewObject) {
		objects.Add(finished);
	}

	internalObjectStartPosition = frame.ParentStartPosition;
}
|
#endregion |
|
|
|
#region Read Tag |
|
/// <summary>
/// Reads one tag: opening bracket, name (where the tag kind has one), then
/// attributes, DTD content or text depending on the kind of tag, and finally the
/// closing bracket. Objects read inside the tag become its nested objects.
/// Context: "&lt;"
/// </summary>
void ReadTag()
{
	AssertHasMoreData();

	int tagStart = this.CurrentLocation;
	InternalTag tag = new InternalTag();
	var frame = BeginInternalObject(tag);

	// Read the opening bracket
	// It identifies the type of tag and parsing behavior for the rest of it
	tag.OpeningBracket = ReadOpeningBracket();

	if (tag.IsUnknownBang && !TryPeekWhiteSpace())
		OnSyntaxError(tagStart, this.CurrentLocation, "Unknown tag");

	if (tag.IsStartOrEmptyTag || tag.IsEndTag || tag.IsProcessingInstruction) {
		// Read the name
		TryMoveToNonWhiteSpace();
		tag.RelativeNameStart = this.CurrentRelativeLocation;
		string name;
		if (TryReadName(out name)) {
			if (!IsValidName(name)) {
				OnSyntaxError(this.CurrentLocation - name.Length, this.CurrentLocation, "The name '{0}' is invalid", name);
			}
		} else {
			OnSyntaxError("Element name expected");
		}
		tag.Name = name;
	} else {
		tag.Name = string.Empty;
	}

	bool isXmlDeclr = tag.Name == "xml" && tag.IsProcessingInstruction;

	// Everything added to 'objects' beyond this index belongs to this tag.
	int oldObjectCount = objects.Count;

	if (tag.IsStartOrEmptyTag || tag.IsEndTag || isXmlDeclr) {
		// Read attributes for the tag
		while (HasMoreData()) {
			// Check for all forbidden 'name' characters first - see ReadName
			TryMoveToNonWhiteSpace();
			if (TryPeek('<')) break;
			string endBr;
			int endBrStart = this.CurrentLocation; // Just peek
			if (TryReadClosingBracket(out endBr)) { // End tag
				// The bracket is consumed for real after the loop.
				GoBack(endBrStart);
				break;
			}

			// We have "=\'\"" or name - read attribute
			int attrStartOffset = this.CurrentLocation;
			ReadAttribute();
			if (tag.IsEndTag)
				OnSyntaxError(attrStartOffset, this.CurrentLocation, "Attribute not allowed in end tag.");
		}
	} else if (tag.IsDocumentType) {
		ReadContentOfDTD();
	} else {
		int start = this.CurrentLocation;
		if (tag.IsComment) {
			ReadText(TextType.Comment);
		} else if (tag.IsCData) {
			ReadText(TextType.CData);
		} else if (tag.IsProcessingInstruction) {
			ReadText(TextType.ProcessingInstruction);
		} else if (tag.IsUnknownBang) {
			ReadText(TextType.UnknownBang);
		} else {
			throw new InternalException(string.Format(CultureInfo.InvariantCulture, "Unknown opening bracket '{0}'", tag.OpeningBracket));
		}
		// Backtrack to the start of the content when no terminator was found:
		// the text that was read is discarded and left for the following objects.
		if (IsEndOfFile() || (tag.IsUnknownBang && TryPeek('<'))) {
			GoBack(start);
			objects.RemoveRange(oldObjectCount, objects.Count - oldObjectCount);
		}
	}

	// Read closing bracket
	string bracket;
	TryReadClosingBracket(out bracket);
	tag.ClosingBracket = bracket;

	// Error check: the closing bracket has to match the kind of tag
	int brStart = this.CurrentLocation - (tag.ClosingBracket ?? string.Empty).Length;
	int brEnd = this.CurrentLocation;
	if (tag.Name == null) {
		// One error was reported already
	} else if (tag.IsStartOrEmptyTag) {
		if (tag.ClosingBracket != ">" && tag.ClosingBracket != "/>") OnSyntaxError(brStart, brEnd, "'>' or '/>' expected");
	} else if (tag.IsEndTag) {
		if (tag.ClosingBracket != ">") OnSyntaxError(brStart, brEnd, "'>' expected");
	} else if (tag.IsComment) {
		if (tag.ClosingBracket != "-->") OnSyntaxError(brStart, brEnd, "'-->' expected");
	} else if (tag.IsCData) {
		if (tag.ClosingBracket != "]]>") OnSyntaxError(brStart, brEnd, "']]>' expected");
	} else if (tag.IsProcessingInstruction) {
		if (tag.ClosingBracket != "?>") OnSyntaxError(brStart, brEnd, "'?>' expected");
	} else if (tag.IsUnknownBang) {
		if (tag.ClosingBracket != ">") OnSyntaxError(brStart, brEnd, "'>' expected");
	} else if (tag.IsDocumentType) {
		if (tag.ClosingBracket != ">") OnSyntaxError(brStart, brEnd, "'>' expected");
	} else {
		throw new InternalException(string.Format(CultureInfo.InvariantCulture, "Unknown opening bracket '{0}'", tag.OpeningBracket));
	}

	if (objects.Count > oldObjectCount) {
		// Move nested objects into tag.NestedObjects:
		tag.NestedObjects = new InternalObject[objects.Count - oldObjectCount];
		objects.CopyTo(oldObjectCount, tag.NestedObjects, 0, tag.NestedObjects.Length);
		objects.RemoveRange(oldObjectCount, objects.Count - oldObjectCount);

		// An attribute name may not appear multiple times.
		// HashSet.Add returns false when the name is already in the set, so the
		// error is reported on the second and any later occurrence of a name.
		HashSet<string> attributeNames = new HashSet<string>();
		foreach (var obj in tag.NestedObjects) {
			InternalAttribute attr = obj as InternalAttribute;
			if (attr != null && !attributeNames.Add(attr.Name)) {
				int attrStart = tagStart + attr.StartRelativeToParent;
				OnSyntaxError(attrStart, attrStart + attr.Name.Length, "Attribute with name '{0}' already exists", attr.Name);
			}
		}
	}

	EndInternalObject(frame);
}
|
#endregion |
|
|
|
#region Read DTD |
|
/// <summary>
/// Reads the content of a document type declaration up to (but not including)
/// its closing '>' or a stray '&lt;'. Tags found inside a bracketed internal
/// subset are read as tags; the text around them is stored as text objects.
/// </summary>
void ReadContentOfDTD()
{
	// Start of the text that has not been turned into an object yet
	int pendingTextStart = this.CurrentLocation;

	while (HasMoreData()) {
		TryMoveToNonWhiteSpace();

		// Quoted strings: jump to the closing quote so that their content is not interpreted.
		// NOTE(review): the single-quote line was marked "TODO: Bug" by the original author.
		if (TryRead('\'')) TryMoveTo('\'');
		if (TryRead('\"')) TryMoveTo('\"');

		if (TryRead('[')) {
			// Nested infoset: read tags until the closing ']' or the end of input
			while (HasMoreData()) {
				TryMoveToAnyOf('<', ']');
				if (TryPeek('<')) {
					// Flush the text in front of the tag (there is none when two tags follow each other)
					if (pendingTextStart != this.CurrentLocation) {
						MakeText(pendingTextStart, this.CurrentLocation);
					}
					ReadTag();
					pendingTextStart = this.CurrentLocation;
				}
				if (TryPeek(']')) break;
			}
		}
		TryRead(']'); // End of nested infoset

		// '>' is the proper closing, '<' means the XML is malformed
		if (TryPeek('>') || TryPeek('<')) break;

		TryMoveNext(); // Skip anything else
	}

	if (pendingTextStart != this.CurrentLocation) {
		MakeText(pendingTextStart, this.CurrentLocation);
	}
}
|
|
|
/// <summary>
/// Stores the input between <paramref name="start"/> and <paramref name="end"/>
/// as a text object of type <c>TextType.Other</c>.
/// </summary>
/// <param name="start">Location of the first character of the text.</param>
/// <param name="end">Location just behind the text; expected to be the current location.</param>
void MakeText(int start, int end)
{
	Log.DebugAssert(end > start, "Empty text");
	Log.DebugAssert(end == this.CurrentLocation, "end == current location");

	var textObject = new InternalText();
	var frame = BeginInternalObject(textObject, start);

	textObject.Type = TextType.Other;
	string content = GetText(start, end);
	textObject.Value = content;

	EndInternalObject(frame);
}
|
#endregion |
|
|
|
#region Read Brackets |
|
/// <summary>
/// Reads any of the known opening brackets (always the complete bracket) and returns it.
/// Context: "&lt;"
/// </summary>
/// <returns>
/// One of "&lt;/", "&lt;?", "&lt;!--", "&lt;![CDATA[", an entry of AXmlTag.DtdNames,
/// "&lt;!" or "&lt;".
/// </returns>
string ReadOpeningBracket()
{
	// Only string literals are returned so that the string instances are shared
	if (!TryRead('<')) {
		throw new InternalException("'<' expected");
	}

	if (TryRead('/')) return "</";
	if (TryRead('?')) return "<?";
	if (!TryRead('!')) return "<";

	// Some kind of "<!..." bracket
	if (TryRead("--")) return "<!--";
	if (TryRead("[CDATA[")) return "<![CDATA[";

	foreach (string dtdName in AXmlTag.DtdNames) {
		// The dtdName includes "<!", which has been consumed already
		string remainder = dtdName.Substring(2);
		if (TryRead(remainder)) return dtdName;
	}
	return "<!";
}
|
|
|
/// <summary>
/// Tries to read any of the known closing brackets (always the complete bracket).
/// Context: any
/// </summary>
/// <param name="bracket">The bracket that was read, or an empty string when there was none.</param>
/// <returns>True when a closing bracket was consumed.</returns>
bool TryReadClosingBracket(out string bracket)
{
	// Only string literals are assigned so that the string instances are shared
	if (TryRead('>')) {
		bracket = ">";
		return true;
	}
	if (TryRead("/>")) {
		bracket = "/>";
		return true;
	}
	if (TryRead("?>")) {
		bracket = "?>";
		return true;
	}
	if (TryRead("-->")) {
		bracket = "-->";
		return true;
	}
	if (TryRead("]]>")) {
		bracket = "]]>";
		return true;
	}

	bracket = string.Empty;
	return false;
}
|
#endregion |
|
|
|
#region Attributes |
|
/// <summary>
/// Reads one attribute (name, equals sign and value) and stores it as an
/// InternalAttribute. Malformed input is accepted as far as possible and
/// reported through syntax errors.
/// Context: name or "=\'\""
/// </summary>
void ReadAttribute()
{
	AssertHasMoreData();

	var attribute = new InternalAttribute();
	var frame = BeginInternalObject(attribute);

	// --- Name ---
	string attributeName;
	if (!TryReadName(out attributeName)) {
		OnSyntaxError("Attribute name expected");
	} else if (!IsValidName(attributeName)) {
		OnSyntaxError(this.CurrentLocation - attributeName.Length, this.CurrentLocation, "The name '{0}' is invalid", attributeName);
	}
	attribute.Name = attributeName;

	// --- Equals sign and surrounding whitespace ---
	int beforeEquals = this.CurrentLocation;
	TryMoveToNonWhiteSpace();
	if (!TryRead('=')) {
		// No equals sign: give the whitespace back as well
		GoBack(beforeEquals);
		OnSyntaxError("'=' expected");
		attribute.EqualsSignLength = 0;
	} else {
		int afterEquals = this.CurrentLocation;
		TryMoveToNonWhiteSpace();
		bool quoteFollows = TryPeek('"') || TryPeek('\'');
		if (!quoteFollows) {
			// Do not read whitespace if quote does not follow
			GoBack(afterEquals);
		}
		attribute.EqualsSignLength = this.CurrentLocation - beforeEquals;
	}

	// --- Value ---
	int rawValueStart = this.CurrentLocation;
	char quoteChar = TryPeek('"') ? '"' : '\'';
	bool startsWithQuote = TryRead(quoteChar);
	if (startsWithQuote) {
		int contentStart = this.CurrentLocation;
		TryMoveToAnyOf(quoteChar, '<');
		if (!TryRead(quoteChar)) {
			// Stopped at '<' or at the end of file: the closing quote is missing
			GoBack(contentStart);
			ReadAttributeValue(quoteChar);
			OnSyntaxError("Quote {0} expected", quoteChar);
		} else if (!TryPeekAnyOf(' ', '\t', '\n', '\r', '/', '>', '?')) {
			// Something unexpected follows the closing quote
			if (TryPeekPrevious('=', 2) || (TryPeekPrevious('=', 3) && TryPeekPrevious(' ', 2))) {
				// This actually most likely means that we are in the next attribute value
				GoBack(contentStart);
				ReadAttributeValue(quoteChar);
				if (TryRead(quoteChar)) {
					OnSyntaxError("White space or end of tag expected");
				} else {
					OnSyntaxError("Quote {0} expected (or add whitespace after the following one)", quoteChar);
				}
			} else {
				OnSyntaxError("White space or end of tag expected");
			}
		}
	} else {
		// Unquoted (or missing) value
		int contentStart = this.CurrentLocation;
		ReadAttributeValue(null);
		TryRead('\"');
		TryRead('\'');
		if (contentStart == this.CurrentLocation) {
			OnSyntaxError("Attribute value expected");
		} else {
			OnSyntaxError(contentStart, this.CurrentLocation, "Attribute value must be quoted");
		}
	}

	string unquotedValue = Unquote(GetText(rawValueStart, this.CurrentLocation));
	// Entity error locations are measured from the first character behind the opening quote
	int valueLocation = startsWithQuote ? rawValueStart + 1 : rawValueStart;
	attribute.Value = Dereference(unquotedValue, valueLocation);

	EndInternalObject(frame);
}
|
|
|
/// <summary>
/// Reads everything up to a quote (excluding it), an opening/closing bracket
/// or the signature of a following attribute (name, '=', quote).
/// </summary>
/// <param name="quote">
/// The quote character that ends the value; when null, either kind of quote ends it.
/// </param>
void ReadAttributeValue(char? quote)
{
	while (HasMoreData()) {
		int iterationStart = this.CurrentLocation;
		TryMoveToNonWhiteSpace(); // Read white space (if any)

		// Terminating quote reached?
		bool atQuote = quote.HasValue
			? TryPeek(quote.Value)
			: (TryPeek('"') || TryPeek('\''));
		if (atQuote) return;

		// Opening/closing bracket: the value ends in front of the whitespace
		string ignoredBracket;
		if (TryPeek('<') || TryReadClosingBracket(out ignoredBracket)) {
			GoBack(iterationStart);
			return;
		}

		if (!TryReadName()) {
			TryMoveNext(); // Accept everything else
			continue;
		}

		// A name was read - check whether it starts the next attribute
		int nameEnd = this.CurrentLocation;
		bool nextAttributeStarts =
			TryMoveToNonWhiteSpace() && TryRead("=") &&
			TryMoveToNonWhiteSpace() && TryPeekAnyOf('"', '\'');
		if (nextAttributeStarts) {
			// Start of attribute: the value ends in front of it
			GoBack(iterationStart);
			return;
		}

		// Just some garbage - make it part of the value and read more
		GoBack(nameEnd);
	}
}
|
|
|
/// <summary>
/// Removes quoting from the given string: a leading quote is dropped together
/// with an identical trailing one; when there is no leading quote, a single
/// trailing quote is dropped. Null or empty input yields an empty string.
/// </summary>
static string Unquote(string quoted)
{
	if (string.IsNullOrEmpty(quoted)) return string.Empty;

	int length = quoted.Length;
	char head = quoted[0];
	bool headIsQuote = head == '"' || head == '\'';

	// A lone quote has no content; any other single character is kept
	if (length == 1) return headIsQuote ? string.Empty : quoted;

	char tail = quoted[length - 1];
	if (headIsQuote) {
		// Drop the first quote and, when it matches, the last one as well
		int contentLength = (tail == head) ? length - 2 : length - 1;
		return quoted.Substring(1, contentLength);
	}

	bool tailIsQuote = tail == '"' || tail == '\'';
	return tailIsQuote ? quoted.Substring(0, length - 1) : quoted;
}
|
#endregion |
|
|
|
#region Text |
|
/// <summary>
/// Reads text of the given type, starting at the current location, and stores it
/// as one or more text objects. Long text is split into fragments of at most
/// 128 characters; a fragment that turns out to be empty is not stored.
/// Reading stops in front of the terminator that belongs to the text type
/// (for example '&lt;' for character data or "-->" for comments).
/// </summary>
/// <param name="type">Kind of text to read; it selects the terminator.</param>
void ReadText(TextType type)
{
	const int maxTextFragmentSize = 128;

	bool finished;
	do {
		var fragment = new InternalText();
		var frame = BeginInternalObject(fragment);
		fragment.Type = type;

		int fragmentStart = this.CurrentLocation;

		// Limit the reading to just a few characters
		// (fragmentEnd is the first character not to be read)
		int fragmentEnd = Math.Min(fragmentStart + maxTextFragmentSize, this.InputLength);

		// Whitespace would be skipped anyway by any operation
		TryMoveToNonWhiteSpace(fragmentEnd);
		int whitespaceEnd = this.CurrentLocation;

		// Try to move to the terminator given by the context
		if (type == TextType.WhiteSpace) {
			TryMoveToNonWhiteSpace(fragmentEnd);
		} else if (type == TextType.CharacterData) {
			// Stops at the end of the fragment or in front of '<'
			while (TryMoveToAnyOf(new char[] {'<', ']'}, fragmentEnd) && !TryPeek('<')) {
				if (!TryPeek(']')) {
					throw new InternalException("Infinite loop");
				}
				if (TryPeek("]]>")) {
					OnSyntaxError(this.CurrentLocation, this.CurrentLocation + 3, "']]>' is not allowed in text");
				}
				TryMoveNext();
			}
		} else if (type == TextType.Comment) {
			// Do not report too many errors
			bool errorReported = false;
			// Stops at the end of the fragment or in front of "-->"
			while (TryMoveTo('-', fragmentEnd) && !TryPeek("-->")) {
				if (TryPeek("--") && !errorReported) {
					OnSyntaxError(this.CurrentLocation, this.CurrentLocation + 2, "'--' is not allowed in comment");
					errorReported = true;
				}
				TryMoveNext();
			}
		} else if (type == TextType.CData) {
			// We can not use TryMoveTo("]]>", fragmentEnd) because it may incorrectly accept "]" at the end of fragment
			while (TryMoveTo(']', fragmentEnd) && !TryPeek("]]>")) {
				TryMoveNext();
			}
		} else if (type == TextType.ProcessingInstruction) {
			while (TryMoveTo('?', fragmentEnd) && !TryPeek("?>")) {
				TryMoveNext();
			}
		} else if (type == TextType.UnknownBang) {
			TryMoveToAnyOf(new char[] {'<', '>'}, fragmentEnd);
		} else {
			throw new InternalException("Uknown type " + type);
		}

		fragment.ContainsOnlyWhitespace = (whitespaceEnd == this.CurrentLocation);

		// Terminator found or real end was reached
		finished = this.CurrentLocation < fragmentEnd || IsEndOfFile();

		if (!finished) {
			// More fragments follow. If an entity reference is close to the end of
			// this fragment, move back so that the next fragment starts with it;
			// this prevents the reference from being split between fragments.
			int searchStart = Math.Max(fragmentStart + 1 /* data for us */, this.CurrentLocation - maxEntityLength);
			int searchLength = this.CurrentLocation - searchStart;
			if (searchLength > 0) {
				// Note that LastIndexOf works backward
				int ampersandIndex = input.LastIndexOf('&', this.CurrentLocation - searchLength, searchLength);
				if (ampersandIndex != -1) {
					GoBack(ampersandIndex);
				}
			}
		}

		string rawText = GetText(fragmentStart, this.CurrentLocation);
		// Only character data is dereferenced; end of line is normalized first
		string value = (type == TextType.CharacterData)
			? Dereference(NormalizeEndOfLine(rawText), fragmentStart)
			: rawText;
		fragment.Value = GetCachedString(value);

		EndInternalObject(frame, storeNewObject: this.CurrentLocation > fragmentStart);
	} while (!finished);
}
|
#endregion |
|
|
|
#region Dereference |
|
const int maxEntityLength = 16; // The longest built-in one is 10 ("&#1114111;")

/// <summary>
/// Replaces the entity references in <paramref name="text"/>: the five predefined
/// entities (amp, lt, gt, apos, quot) and numeric character references
/// (&amp;#N; and &amp;#xH;). References that can not be resolved are kept verbatim
/// and, with the exception of well-formed but unknown names when the parser does
/// not treat those as errors, reported as syntax errors.
/// </summary>
/// <param name="text">The text to dereference.</param>
/// <param name="textLocation">
/// Location of the first character of <paramref name="text"/> in the input;
/// it is only used to position syntax errors.
/// </param>
/// <returns>
/// The dereferenced text; the same string instance when the text contains no '&amp;'.
/// </returns>
string Dereference(string text, int textLocation)
{
	StringBuilder sb = null; // The dereferenced text so far (all up to 'curr')
	int curr = 0;
	while (true) {
		// Reached end of input
		if (curr == text.Length) {
			return sb != null ? sb.ToString() : text;
		}

		// Try to find reference
		int start = text.IndexOf('&', curr);

		// No more references found
		if (start == -1) {
			if (sb == null) return text;
			sb.Append(text, curr, text.Length - curr); // Add rest
			return sb.ToString();
		}

		// Append text before the entity reference
		if (sb == null) sb = new StringBuilder(text.Length);
		sb.Append(text, curr, start - curr);
		curr = start;

		// Location of the '&' in the input. It has to be derived from the position
		// in 'text': the length of the output built so far is shorter than that
		// as soon as an earlier reference has been replaced.
		// (If the caller altered the text before passing it in, e.g. by normalizing
		// line ends, the location can still be off by the characters removed.)
		int errorLoc = textLocation + start;

		// Find entity name
		int end = text.IndexOfAny(new char[] {'&', ';'}, start + 1, Math.Min(maxEntityLength, text.Length - (start + 1)));
		if (end == -1 || text[end] == '&') {
			// Not found
			OnSyntaxError(errorLoc, errorLoc + 1, "Entity reference must be terminated with ';'");
			// Keep '&'
			sb.Append('&');
			curr++;
			continue; // Restart at the next character location
		}
		string name = text.Substring(start + 1, end - (start + 1));

		// Resolve the name
		string replacement;
		if (name.Length == 0) {
			replacement = null;
			OnSyntaxError(errorLoc + 1, errorLoc + 1, "Entity name expected");
		} else if (name == "amp") {
			replacement = "&";
		} else if (name == "lt") {
			replacement = "<";
		} else if (name == "gt") {
			replacement = ">";
		} else if (name == "apos") {
			replacement = "'";
		} else if (name == "quot") {
			replacement = "\"";
		} else if (name[0] == '#') {
			// Numeric character reference. Success is tracked separately from the
			// number: a hexadecimal literal such as FFFFFFFF parses to -1, so the
			// value itself can not serve as the failure marker.
			int num;
			bool parsed;
			if (name.Length > 1 && name[1] == 'x') {
				parsed = int.TryParse(name.Substring(2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture.NumberFormat, out num);
				if (!parsed) {
					OnSyntaxError(errorLoc + 3, errorLoc + 1 + name.Length, "Hexadecimal code of unicode character expected");
				}
			} else {
				parsed = int.TryParse(name.Substring(1), NumberStyles.None, CultureInfo.InvariantCulture.NumberFormat, out num);
				if (!parsed) {
					OnSyntaxError(errorLoc + 2, errorLoc + 1 + name.Length, "Numeric code of unicode character expected");
				}
			}
			if (parsed) {
				try {
					replacement = char.ConvertFromUtf32(num);
				} catch (ArgumentOutOfRangeException) {
					// Not a valid code point (out of range, negative or a surrogate)
					replacement = null;
					OnSyntaxError(errorLoc + 2, errorLoc + 1 + name.Length, "Invalid unicode character U+{0:X} ({0})", num);
				}
			} else {
				replacement = null;
			}
		} else if (!IsValidName(name)) {
			replacement = null;
			OnSyntaxError(errorLoc + 1, errorLoc + 1, "Invalid entity name");
		} else {
			// Well-formed name that is not one of the predefined entities
			replacement = null;
			if (tagSoupParser.UnknownEntityReferenceIsError) {
				OnSyntaxError(errorLoc, errorLoc + 1 + name.Length + 1, "Unknown entity reference '{0}'", name);
			}
		}

		// Append the replacement to output
		if (replacement != null) {
			sb.Append(replacement);
		} else {
			// Unresolved: keep the reference as it was written
			sb.Append('&');
			sb.Append(name);
			sb.Append(';');
		}
		curr = end + 1;
	}
}
|
#endregion |
|
|
|
#region Syntax Errors |
|
/// <summary>Syntax errors reported since the last call to GetSyntaxErrors.</summary>
List<InternalSyntaxError> syntaxErrors = new List<InternalSyntaxError>();

/// <summary>
/// Returns the syntax errors collected so far and empties the collection.
/// </summary>
/// <returns>The collected errors, or null when there are none.</returns>
InternalSyntaxError[] GetSyntaxErrors()
{
	if (syntaxErrors.Count == 0) return null;

	InternalSyntaxError[] collected = syntaxErrors.ToArray();
	syntaxErrors.Clear();
	return collected;
}
|
|
|
/// <summary>
/// Reports a syntax error that covers the single character at the current location.
/// </summary>
/// <param name="message">Composite format string of the message.</param>
/// <param name="args">Arguments for the format string.</param>
void OnSyntaxError(string message, params object[] args)
{
	int here = this.CurrentLocation;
	OnSyntaxError(here, here + 1, message, args);
}

/// <summary>
/// Reports a syntax error for the given range of the input. The message is
/// formatted with the invariant culture and logged; the error is stored with
/// positions relative to the start of the object currently being read.
/// </summary>
/// <param name="start">Location of the first character of the error.</param>
/// <param name="end">Location behind the error; an empty or reversed range is widened to one character.</param>
/// <param name="message">Composite format string of the message.</param>
/// <param name="args">Arguments for the format string.</param>
void OnSyntaxError(int start, int end, string message, params object[] args)
{
	int effectiveEnd = (end <= start) ? start + 1 : end;
	string text = string.Format(CultureInfo.InvariantCulture, message, args);
	Log.WriteLine("Syntax error ({0}-{1}): {2}", start, effectiveEnd, text);

	int origin = internalObjectStartPosition;
	syntaxErrors.Add(new InternalSyntaxError(start - origin, effectiveEnd - origin, text));
}
|
#endregion |
|
|
|
#region Helper functions |
|
/// <summary>
/// Checks whether <paramref name="name"/> is a valid XML name.
/// </summary>
/// <param name="name">The name to check; may be null or empty.</param>
/// <returns>True for a valid XML name; false otherwise, including for null and empty input.</returns>
static bool IsValidName(string name)
{
	// VerifyName signals null or empty input with an ArgumentNullException,
	// which the handler below does not cover - such a name is simply not valid.
	if (string.IsNullOrEmpty(name)) return false;

	try {
		System.Xml.XmlConvert.VerifyName(name);
		return true;
	} catch (System.Xml.XmlException) {
		return false;
	}
}
|
|
|
/// <summary>
/// Converts every line end to a single line feed: first "\r\n" pairs,
/// then any remaining lone '\r'.
/// </summary>
static string NormalizeEndOfLine(string text)
{
	string withoutCrLf = text.Replace("\r\n", "\n");
	string normalized = withoutCrLf.Replace('\r', '\n');
	return normalized;
}
|
#endregion |
|
} |
|
}
|
|
|