Browse Source

XML Parser:

- Optimized text reading functions
 - Simplified ReadText method

git-svn-id: svn://svn.sharpdevelop.net/sharpdevelop/trunk@4629 1ccf3a8d-04fe-1044-b7c0-cef0b8235c61
shortcuts
David Srbecký 17 years ago
parent
commit
eb190567a5
  1. 238
      src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

238
src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

@ -88,8 +88,6 @@ namespace ICSharpCode.AvalonEdit.XmlParser
public class XmlParser public class XmlParser
{ {
// TODO: Simple tag matching heuristic // TODO: Simple tag matching heuristic
// TODO: Delete some read functions and optimize performance
// TODO: Rewrite ReadText
RawDocument userDocument; RawDocument userDocument;
TextDocument textDocument; TextDocument textDocument;
@ -166,7 +164,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
currentLocation = 0; currentLocation = 0;
maxTouchedLocation = 0; maxTouchedLocation = 0;
readingEnd = input.Length; inputLength = input.Length;
RawDocument parsedDocument = ReadDocument(); RawDocument parsedDocument = ReadDocument();
// Just in case parse method was called redundantly // Just in case parse method was called redundantly
@ -281,8 +279,10 @@ namespace ICSharpCode.AvalonEdit.XmlParser
}); });
} }
#region Text reading methods
string input; string input;
int readingEnd; int inputLength;
// Do not ever set the value from parsing methods // Do not ever set the value from parsing methods
// most importantly do not backtrack except with GoBack(int) // most importantly do not backtrack except with GoBack(int)
int currentLocation; int currentLocation;
@ -295,24 +295,24 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool IsEndOfFile() bool IsEndOfFile()
{ {
return currentLocation == readingEnd; return currentLocation == inputLength;
} }
bool HasMoreData() bool HasMoreData()
{ {
return currentLocation < readingEnd; return currentLocation < inputLength;
} }
void AssertHasMoreData() void AssertHasMoreData()
{ {
if (currentLocation == readingEnd) { if (currentLocation == inputLength) {
throw new Exception("Unexpected end of files"); throw new Exception("Unexpected end of file");
} }
} }
bool TryMoveNext() bool TryMoveNext()
{ {
if (currentLocation == readingEnd) return false; if (currentLocation == inputLength) return false;
currentLocation++; currentLocation++;
return true; return true;
@ -327,7 +327,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool TryRead(char c) bool TryRead(char c)
{ {
if (currentLocation == readingEnd) return false; if (currentLocation == inputLength) return false;
if (input[currentLocation] == c) { if (input[currentLocation] == c) {
currentLocation++; currentLocation++;
@ -337,22 +337,22 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
bool TryRead(string text) bool TryReadAnyOf(params char[] c)
{ {
if (TryPeek(text)) { if (currentLocation == inputLength) return false;
currentLocation += text.Length;
if (c.Contains(input[currentLocation])) {
currentLocation++;
return true; return true;
} else { } else {
return false; return false;
} }
} }
/// <summary> Returns true if at least one character was read </summary> bool TryRead(string text)
bool TryReadPartOf(string text)
{ {
if (TryPeek(text[0])) { if (TryPeek(text)) {
// Keep reading until character differs or we have end of file currentLocation += text.Length;
foreach(char c in text) if (!TryRead(c)) break;
return true; return true;
} else { } else {
return false; return false;
@ -361,7 +361,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool TryPeekPrevious(char c, int back) bool TryPeekPrevious(char c, int back)
{ {
if (currentLocation - back == readingEnd) return false; if (currentLocation - back == inputLength) return false;
if (currentLocation - back < 0 ) return false; if (currentLocation - back < 0 ) return false;
return input[currentLocation - back] == c; return input[currentLocation - back] == c;
@ -369,14 +369,14 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool TryPeek(char c) bool TryPeek(char c)
{ {
if (currentLocation == readingEnd) return false; if (currentLocation == inputLength) return false;
return input[currentLocation] == c; return input[currentLocation] == c;
} }
bool TryPeekAnyOf(params char[] chars) bool TryPeekAnyOf(params char[] chars)
{ {
if (currentLocation == readingEnd) return false; if (currentLocation == inputLength) return false;
return chars.Contains(input[currentLocation]); return chars.Contains(input[currentLocation]);
} }
@ -386,69 +386,87 @@ namespace ICSharpCode.AvalonEdit.XmlParser
if (!TryPeek(text[0])) return false; // Early exit if (!TryPeek(text[0])) return false; // Early exit
maxTouchedLocation = Math.Max(maxTouchedLocation, currentLocation + (text.Length - 1)); maxTouchedLocation = Math.Max(maxTouchedLocation, currentLocation + (text.Length - 1));
// The following comparison 'touches' the end of file // The following comparison 'touches' the end of file - it does depend on the end being there
if (currentLocation + text.Length > readingEnd) return false; if (currentLocation + text.Length > inputLength) return false;
return input.Substring(currentLocation, text.Length) == text; return input.Substring(currentLocation, text.Length) == text;
} }
bool TryPeekWhiteSpace()
{
if (currentLocation == inputLength) return false;
char c = input[currentLocation];
return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
// The move functions do not have to move if already at target
// The move functions allow 'overriding' of the document length
bool TryMoveTo(char c) bool TryMoveTo(char c)
{ {
while(true) { return TryMoveTo(c, inputLength);
if (currentLocation == readingEnd) return false;
if (input[currentLocation] == c) return true;
currentLocation++;
}
} }
bool TryMoveTo(string text) bool TryMoveTo(char c, int inputLength)
{ {
while(true) { if (currentLocation == inputLength) return false;
if (!TryMoveTo(text[0])) return false; // End of file int index = input.IndexOf(c, currentLocation, inputLength - currentLocation);
if (TryPeek(text)) return true; if (index != -1) {
currentLocation++; currentLocation = index;
return true;
} else {
currentLocation = inputLength;
return false;
} }
} }
bool TryMoveToAnyOf(params char[] c) bool TryMoveToAnyOf(params char[] c)
{ {
while(true) { return TryMoveToAnyOf(c, inputLength);
if (currentLocation == readingEnd) return false; }
if (c.Contains(input[currentLocation])) return true;
currentLocation++; bool TryMoveToAnyOf(char[] c, int inputLength)
{
if (currentLocation == inputLength) return false;
int index = input.IndexOfAny(c, currentLocation, inputLength - currentLocation);
if (index != -1) {
currentLocation = index;
return true;
} else {
currentLocation = inputLength;
return false;
} }
} }
bool TryMoveToNonWhiteSpace() bool TryMoveTo(string text)
{ {
while (TryPeekWhiteSpace()) TryMoveNext(); return TryMoveTo(text, inputLength);
return HasMoreData();
} }
string GetText(int start, int end) bool TryMoveTo(string text, int inputLength)
{ {
if (start == readingEnd && end == readingEnd) { if (currentLocation == inputLength) return false;
return string.Empty; int index = input.IndexOf(text, currentLocation, inputLength - currentLocation, StringComparison.Ordinal);
if (index != -1) {
maxTouchedLocation = index + text.Length - 1;
currentLocation = index;
return true;
} else { } else {
return GetCachedString(input.Substring(start, end - start)); currentLocation = inputLength;
return false;
} }
} }
static char[] WhiteSpaceChars = new char[] {' ', '\n', '\r', '\t'}; bool TryMoveToNonWhiteSpace()
static char[] WhiteSpaceAndReservedChars = new char[] {' ', '\n', '\r', '\t', '=', '\'', '"', '<', '>', '/', '?'};
bool TryPeekWhiteSpace()
{ {
if (currentLocation == readingEnd) return false; return TryMoveToNonWhiteSpace(inputLength);
return WhiteSpaceChars.Contains(input[currentLocation]);
} }
bool TryPeekNameChar() bool TryMoveToNonWhiteSpace(int inputLength)
{ {
if (currentLocation == readingEnd) return false; while(TryPeekWhiteSpace()) currentLocation++;
return HasMoreData();
return !WhiteSpaceAndReservedChars.Contains(input[currentLocation]);
} }
/// <summary> /// <summary>
@ -457,19 +475,34 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// "" End of file /// "" End of file
/// " \n\r\t" Whitesapce /// " \n\r\t" Whitesapce
/// "=\'\"" Attribute value /// "=\'\"" Attribute value
/// "&lt;" Openning Tag /// "&lt;>/?" Tags
/// ">/?" Closing Tag
/// </summary> /// </summary>
/// <returns> True if read at least one character </returns>
bool TryReadName(out string res) bool TryReadName(out string res)
{ {
int start = currentLocation; int start = currentLocation;
TryMoveToAnyOf(WhiteSpaceAndReservedChars.ToArray()); // Keep reading up to invalid character
while(true) {
if (currentLocation == inputLength) break; // Reject end of file
char c = input[currentLocation];
if (0x41 <= (int)c && (int)c <= 0x7A) { // Accpet 0x41-0x7A (A-Z[\]^_`a-z)
currentLocation++;
continue;
}
if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || // Reject whitesapce
c == '=' || c == '\'' || c == '"' || // Reject attributes
c == '<' || c == '>' || c == '/' || c == '?') { // Reject tags
break;
} else {
currentLocation++;
continue; // Accept other character
}
}
if (start == currentLocation) { if (start == currentLocation) {
res = null; res = null;
return false; return false;
} else { } else {
res = GetText(start, currentLocation); res = GetText(start, currentLocation);
// TODO: Check that it is valid XML name
return true; return true;
} }
} }
@ -484,6 +517,18 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} }
} }
string GetText(int start, int end)
{
if (end > currentLocation) throw new Exception("Reading ahead of current location");
if (start == inputLength && end == inputLength) {
return string.Empty;
} else {
return GetCachedString(input.Substring(start, end - start));
}
}
#endregion
/// <summary> /// <summary>
/// Context: any /// Context: any
/// </summary> /// </summary>
@ -906,8 +951,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
const int maxEntityLenght = 12; // The longest build-in one is 10 ("&#x10FFFF;") const int maxEntityLenght = 12; // The longest build-in one is 10 ("&#x10FFFF;")
const int maxTextFragmentSize = 8; const int maxTextFragmentSize = 8;
const int lookAheadLenght = (3 * maxTextFragmentSize) / 2; const int lookAheadLenght = (3 * maxTextFragmentSize) / 2; // More so that we do not get small "what was inserted" fragments
const int backtrackLenght = 4; // 2: get back over "]]" 1: so that we have some data 1: safety
/// <summary> /// <summary>
/// Reads text and optionaly separates it into fragments. /// Reads text and optionaly separates it into fragments.
@ -916,8 +960,6 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// </summary> /// </summary>
IEnumerable<RawObject> ReadText(RawTextType type) IEnumerable<RawObject> ReadText(RawTextType type)
{ {
// TODO: Rewrite
bool lookahead = false; bool lookahead = false;
while(true) { while(true) {
RawText text; RawText text;
@ -929,18 +971,16 @@ namespace ICSharpCode.AvalonEdit.XmlParser
text.Type = type; text.Type = type;
// Limit the reading to just a few characters // Limit the reading to just a few characters
int realReadingEnd = readingEnd; // (the first character not to be read)
readingEnd = Math.Min(realReadingEnd, currentLocation + maxTextFragmentSize); int fragmentEnd = Math.Min(currentLocation + maxTextFragmentSize, inputLength);
// Look if some futher text has been already processed and align so that // Look if some futher text has been already processed and align so that
// we hit that chache point. It is expensive so it is off for the first run // we hit that chache point. It is expensive so it is off for the first run
if (lookahead) { if (lookahead) {
int nextFragmentIndex = GetStartOfCachedObject<RawText>(t => t.Type == type, currentLocation, lookAheadLenght); int nextFragmentIndex = GetStartOfCachedObject<RawText>(t => t.Type == type, currentLocation, lookAheadLenght);
if (nextFragmentIndex != -1) { if (nextFragmentIndex != -1) {
// Consider adding "aaa]" before cached fragment "]>bbb" fragmentEnd = Math.Min(nextFragmentIndex, inputLength);
// We must not use cache then - so the overshoot acutally makes sense Log("Parsing only text ({0}-{1}) because later text was already processed", currentLocation, fragmentEnd);
readingEnd = Math.Min(realReadingEnd, nextFragmentIndex + backtrackLenght);
Log("Parsing only text ({0}-{1}) because later text was already processed", currentLocation, readingEnd);
} }
} }
lookahead = true; lookahead = true;
@ -950,11 +990,10 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// Try move to the terminator given by the context // Try move to the terminator given by the context
if (type == RawTextType.WhiteSpace) { if (type == RawTextType.WhiteSpace) {
TryMoveToNonWhiteSpace(); TryMoveToNonWhiteSpace(fragmentEnd);
} else if (type == RawTextType.CharacterData) { } else if (type == RawTextType.CharacterData) {
while(true) { while(true) {
TryMoveToAnyOf('<', ']'); if (!TryMoveToAnyOf(new char[] {'<', ']'}, fragmentEnd)) break; // End of fragment
if (IsEndOfFile()) break;
if (TryPeek('<')) break; if (TryPeek('<')) break;
if (TryPeek(']')) { if (TryPeek(']')) {
if (TryPeek("]]>")) { if (TryPeek("]]>")) {
@ -963,50 +1002,57 @@ namespace ICSharpCode.AvalonEdit.XmlParser
TryMoveNext(); TryMoveNext();
continue; continue;
} }
throw new Exception("Infinite loop");
} }
} else if (type == RawTextType.Comment) { } else if (type == RawTextType.Comment) {
// Do not report too many errors
bool errorReported = false;
while(true) { while(true) {
if (TryMoveTo('-')) { if (!TryMoveTo('-', fragmentEnd)) break; // End of fragment
if (TryPeek("-->")) break; if (TryPeek("-->")) break;
if (TryPeek("--")) { if (TryPeek("--") && !errorReported) {
OnSyntaxError(text, currentLocation, currentLocation + 2, "'--' is not allowed in comment"); OnSyntaxError(text, currentLocation, currentLocation + 2, "'--' is not allowed in comment");
} errorReported = true;
TryMoveNext();
} }
if (IsEndOfFile()) break; TryMoveNext();
} }
} else if (type == RawTextType.CData) { } else if (type == RawTextType.CData) {
TryMoveTo("]]>"); while(true) {
// We can not use use TryMoveTo("]]>", fragmentEnd) because it may incorectly accept "]" at the end of fragment
if (!TryMoveTo(']', fragmentEnd)) break; // End of fragment
if (TryPeek("]]>")) break;
TryMoveNext();
}
} else if (type == RawTextType.ProcessingInstruction) { } else if (type == RawTextType.ProcessingInstruction) {
TryMoveTo("?>"); while(true) {
if (!TryMoveTo('?', fragmentEnd)) break; // End of fragment
if (TryPeek("?>")) break;
TryMoveNext();
}
} else if (type == RawTextType.UnknownBang) { } else if (type == RawTextType.UnknownBang) {
TryMoveToAnyOf('<', '>'); TryMoveToAnyOf(new char[] {'<', '>'}, fragmentEnd);
} else { } else {
throw new Exception("Uknown type " + type); throw new Exception("Uknown type " + type);
} }
// Terminal found or real end was reached; // Terminal found or real end was reached;
bool finished = currentLocation < readingEnd || currentLocation == realReadingEnd; bool finished = currentLocation < fragmentEnd || IsEndOfFile();
// Finished reading - restore the old reading end
readingEnd = realReadingEnd;
if (!finished) { if (!finished) {
// We have to continue reading more text fragments // We have to continue reading more text fragments
// We have to backtrack a bit because we just might ended with "]]" and the ">" was cut
int backtrack = currentLocation - backtrackLenght;
// If there is entity reference, make sure the next segment starts with it to prevent framentation // If there is entity reference, make sure the next segment starts with it to prevent framentation
int entitySearchStart = Math.Max(start + 1 /* data for us */, backtrack - maxEntityLenght); int entitySearchStart = Math.Max(start + 1 /* data for us */, currentLocation - maxEntityLenght);
// Note that LastIndexOf works backward int entitySearchLength = currentLocation - entitySearchStart;
int entityIndex = input.LastIndexOf('&', backtrack, backtrack - entitySearchStart); if (entitySearchLength > 0) {
if (entityIndex != -1) { // Note that LastIndexOf works backward
backtrack = entityIndex; int entityIndex = input.LastIndexOf('&', currentLocation - 1, entitySearchLength);
if (entityIndex != -1) {
GoBack(entityIndex);
}
} }
GoBack(Math.Max(start + 1, backtrack)); // Max-just in case
} }
text.Value = GetText(start, currentLocation); text.Value = GetText(start, currentLocation);
text.EndOffset = currentLocation; text.EndOffset = currentLocation;

Loading…
Cancel
Save