Browse Source

XML Parser: Unified text reading methods; Long text may be separated into fragments

git-svn-id: svn://svn.sharpdevelop.net/sharpdevelop/trunk@4601 1ccf3a8d-04fe-1044-b7c0-cef0b8235c61
shortcuts
David Srbecký 16 years ago
parent
commit
200e481ebb
  1. 14
      src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs
  2. 287
      src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

14
src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/RawObjects.cs

@ -164,9 +164,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -164,9 +164,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
this.InsertChildren(this.Children.Count, new RawObject[] {item}.ToList());
}
/// <summary>
/// Appends the given items after the existing children
/// (they are inserted at index Children.Count)
/// </summary>
internal void AddChildren(IList<RawObject> items)
internal void AddChildren(IEnumerable<RawObject> items)
{
this.InsertChildren(this.Children.Count, items);
// The sequence is materialized into a list before it is handed to InsertChildren
this.InsertChildren(this.Children.Count, items.ToList());
}
/// <summary>
@ -543,11 +543,21 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -543,11 +543,21 @@ namespace ICSharpCode.AvalonEdit.XmlParser
}
}
/// <summary>
/// Kind of text stored in a RawText; the parser uses it to decide
/// which terminator ends the text when reading it
/// </summary>
public enum RawTextType
{
/// <summary> Run of whitespace characters (read up to the first non-whitespace character) </summary>
WhiteSpace,
/// <summary> Character data (read up to the next '&lt;') </summary>
CharacterData,
/// <summary> Content of a comment (read up to the next "--") </summary>
Comment,
/// <summary> Content of a CDATA section (read up to the next "]]&gt;") </summary>
CData,
/// <summary> Content of a document type definition (read up to the next '&gt;') </summary>
DocumentTypeDefinition
}
/// <summary>
/// Fragment of text: whitespace, character data, comment, CDATA or document type definition (see RawTextType)
/// </summary>
public class RawText: RawObject
{
public RawTextType Type { get; set; }
public string Value { get; set; }
public override void UpdateDataFrom(RawObject source)

287
src/Libraries/AvalonEdit/ICSharpCode.AvalonEdit/XmlParser/XmlParser.cs

@ -99,8 +99,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -99,8 +99,9 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// </summary>
public RawDocument Parse()
{
currentLocation = 0;
input = textDocument.Text;
readingEnd = input.Length;
currentLocation = 0;
foreach(DocumentChangeEventArgs change in changesSinceLastParse) {
// Update offsets of all items
@ -127,29 +128,49 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -127,29 +128,49 @@ namespace ICSharpCode.AvalonEdit.XmlParser
return userDocument;
}
T ReadFromCache<T>(int location) where T: RawObject
/// <summary>
/// Unconditional variant of the cache lookup: forwards to the overload
/// taking a condition, using a condition that accepts every object.
/// </summary>
bool TryReadFromCacheOrNew<T>(out T res) where T: RawObject, new()
{
    // Any cached object of type T is good enough
    Predicate<T> acceptAny = delegate { return true; };
    return TryReadFromCacheOrNew<T>(acceptAny, out res);
}
/// <summary>
/// Looks in parsedItems for an object of type T which starts exactly at the
/// current location and satisfies the condition.
/// On a hit the current location is advanced by the length of the object,
/// the object is stored in res and true is returned.
/// Otherwise res receives a newly created T and false is returned.
/// </summary>
bool TryReadFromCacheOrNew<T>(Predicate<T> conditon, out T res) where T: RawObject, new()
{
RawObject obj = parsedItems.FindFirstSegmentWithStartAfter(location);
while(obj != null && obj.StartOffset == location) {
if (obj is T) {
RawObject obj = parsedItems.FindFirstSegmentWithStartAfter(currentLocation);
// Only objects starting exactly at the current location are candidates
while(obj != null && obj.StartOffset == currentLocation) {
if (obj is T && conditon((T)obj)) {
// Skip over the cached object so that reading continues right after it
currentLocation += obj.Length;
return (T)obj;
res = (T)obj;
return true;
}
obj = parsedItems.GetNextSegment(obj);
}
return null;
// Cache miss - hand out an empty object for the caller to fill in
res = new T();
return false;
}
void Log(string text, params object[] pars)
/// <summary>
/// Walks parsedItems starting from the given offset and returns the StartOffset
/// of the first object of type T which satisfies the condition and starts
/// no later than offset + lookaheadCount. Returns -1 if there is no such object.
/// </summary>
int GetStartOfCachedObject<T>(Predicate<T> conditon, int offset, int lookaheadCount) where T: RawObject
{
System.Diagnostics.Debug.WriteLine(string.Format("XML Parser: " + text, pars));
RawObject obj = parsedItems.FindFirstSegmentWithStartAfter(offset);
// Stop as soon as the candidates start beyond the lookahead window
while(obj != null && obj.StartOffset <= offset + lookaheadCount) {
if (obj is T && conditon((T)obj)) {
return obj.StartOffset;
}
obj = parsedItems.GetNextSegment(obj);
}
return -1;
}
void LogParsed(RawObject obj)
/// <summary>
/// Registers a newly parsed object: adds it to parsedItems
/// and writes a trace line to the debug output
/// </summary>
void OnParsed(RawObject obj)
{
parsedItems.Add(obj);
System.Diagnostics.Debug.WriteLine("XML Parser: Parsed " + obj.ToString());
}
/// <summary>
/// Writes a message to the debug output, prefixed with "XML Parser: ".
/// </summary>
/// <param name="text">Composite format string of the message</param>
/// <param name="pars">Arguments substituted into the format string</param>
void Log(string text, params object[] pars)
{
    // The prefix becomes part of the format string, exactly as the message does
    string format = "XML Parser: " + text;
    string message = string.Format(format, pars);
    System.Diagnostics.Debug.WriteLine(message);
}
Dictionary<string, string> stringCache = new Dictionary<string, string>();
int stringCacheRequestedCount;
int stringCacheRequestedSize;
@ -180,21 +201,22 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -180,21 +201,22 @@ namespace ICSharpCode.AvalonEdit.XmlParser
}
string input;
int currentLocation;
int readingEnd;
int currentLocation;
/// <summary> True when the current location has reached the end of the data being read </summary>
bool IsEndOfFile()
{
return currentLocation == input.Length;
return currentLocation == readingEnd;
}
/// <summary> True when the current location is still before the end of the data being read </summary>
bool HasMoreData()
{
return currentLocation < input.Length;
return currentLocation < readingEnd;
}
/// <summary> Throws an exception if the current location is at the end of the data being read </summary>
void AssertHasMoreData()
{
if (currentLocation == input.Length) {
if (currentLocation == readingEnd) {
throw new Exception("Unexpected end of files");
}
}
@ -210,7 +232,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -210,7 +232,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool TryMoveNext()
{
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
currentLocation++;
return true;
@ -218,7 +240,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -218,7 +240,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
bool TryRead(char c)
{
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
if (input[currentLocation] == c) {
currentLocation++;
@ -252,39 +274,54 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -252,39 +274,54 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// <summary>
/// Tests whether the character at the current location is the given one.
/// Returns false at the end of the data; the current location is not changed.
/// </summary>
bool TryPeek(char c)
{
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
return input[currentLocation] == c;
}
/// <summary>
/// Tests whether the input at the current location starts with the given text.
/// Returns false if the text would not fit before the end of the data;
/// the current location is not changed.
/// </summary>
bool TryPeek(string text)
{
if (currentLocation + text.Length > input.Length) return false;
if (currentLocation + text.Length > readingEnd) return false;
return input.Substring(currentLocation, text.Length) == text;
// The first character is compared before the substring is extracted
return TryPeek(text[0]) && input.Substring(currentLocation, text.Length) == text;
}
/// <summary>
/// Advances the current location until it points at the given character.
/// Returns true if the character was found; returns false if the end of the data
/// was reached first (the current location is then at the end).
/// </summary>
bool TryMoveTo(char c)
{
while(true) {
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
if (input[currentLocation] == c) return true;
currentLocation++;
}
}
/// <summary>
/// Advances the current location until the input at that location starts
/// with the given text. Returns false if no occurrence is found.
/// </summary>
bool TryMoveTo(string text)
{
    // Hop from one occurrence of the first character to the next
    // and check the whole text only there
    while (TryMoveTo(text[0])) {
        if (TryPeek(text)) {
            return true;
        }
        // Only the first character matched - step over it and keep searching
        currentLocation++;
    }
    // End of file
    return false;
}
/// <summary>
/// Advances the current location until it points at any of the given characters.
/// Returns true if one of them was found; returns false if the end of the data
/// was reached first.
/// </summary>
bool TryMoveToAnyOf(params char[] c)
{
while(true) {
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
if (c.Contains(input[currentLocation])) return true;
currentLocation++;
}
}
/// <summary>
/// Skips over whitespace starting at the current location.
/// Returns true if there is still data left after the whitespace.
/// </summary>
bool TryMoveToNonWhiteSpace()
{
    while (TryPeekWhiteSpace()) {
        TryMoveNext();
    }
    bool dataLeft = HasMoreData();
    return dataLeft;
}
string GetText(int start, int end)
{
if (start == input.Length && end == input.Length) {
if (start == readingEnd && end == readingEnd) {
return string.Empty;
} else {
return GetCachedString(input.Substring(start, end - start));
@ -296,7 +333,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -296,7 +333,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
/// <summary>
/// Tests whether the character at the current location is one of WhiteSpaceChars.
/// Returns false at the end of the data; the current location is not changed.
/// </summary>
bool TryPeekWhiteSpace()
{
if (currentLocation == input.Length) return false;
if (currentLocation == readingEnd) return false;
return WhiteSpaceChars.Contains(input[currentLocation]);
}
@ -312,10 +349,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -312,10 +349,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
RawDocument ReadDocument()
{
RawDocument doc = ReadFromCache<RawDocument>(currentLocation);
if (doc != null) return doc;
doc = new RawDocument();
RawDocument doc;
if (TryReadFromCacheOrNew(out doc)) return doc;
doc.StartOffset = currentLocation;
while(true) {
@ -324,13 +359,12 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -324,13 +359,12 @@ namespace ICSharpCode.AvalonEdit.XmlParser
} else if (TryPeek('<')) {
doc.AddChild(ReadElementOrTag());
} else {
doc.AddChild(ReadCharacterData());
doc.AddChildren(ReadText(RawTextType.CharacterData));
}
}
doc.EndOffset = currentLocation;
LogParsed(doc);
parsedItems.Add(doc);
OnParsed(doc);
return doc;
}
@ -351,10 +385,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -351,10 +385,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
{
AssertHasMoreData();
RawElement element = ReadFromCache<RawElement>(currentLocation);
if (element != null) return element;
element = new RawElement();
RawElement element;
if (TryReadFromCacheOrNew(out element)) return element;
element.StartOffset = currentLocation;
// Read start tag
@ -370,18 +402,16 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -370,18 +402,16 @@ namespace ICSharpCode.AvalonEdit.XmlParser
if (content is RawTag && ((RawTag)content).IsEndTag) break;
element.AddChild(content);
} else {
element.AddChild(ReadCharacterData());
element.AddChildren(ReadText(RawTextType.CharacterData));
}
}
}
element.EndOffset = currentLocation;
LogParsed(element);
parsedItems.Add(element);
OnParsed(element);
return element;
}
// Start tag: "<" Name? (RawText+ RawAttribute)* RawText* (">" | "/>")
// End tag: "</" Name? (RawText+ RawAttribute)* RawText* ">"
// P.instr.: "<?" Name? (RawText+ RawAttribute)* RawText* "?>"
@ -393,10 +423,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -393,10 +423,8 @@ namespace ICSharpCode.AvalonEdit.XmlParser
{
AssertHasMoreData();
RawTag tag = ReadFromCache<RawTag>(currentLocation);
if (tag != null) return tag;
tag = new RawTag();
RawTag tag;
if (TryReadFromCacheOrNew(out tag)) return tag;
tag.StartOffset = currentLocation;
@ -415,7 +443,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -415,7 +443,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// Read attributes for the tag
while(true) {
if (TryPeekWhiteSpace()) {
tag.AddChild(ReadWhiteSpace());
tag.AddChildren(ReadText(RawTextType.WhiteSpace));
}
string bracket;
if (TryReadClosingBracket(out bracket)) {
@ -433,13 +461,13 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -433,13 +461,13 @@ namespace ICSharpCode.AvalonEdit.XmlParser
// Simple tag types
if (tag.IsComment) {
// TODO: Be strict only if the opening bracket is complete
tag.AddChildren(ReadTextUntil("-->").ToList());
tag.AddChildren(ReadText(RawTextType.Comment));
} else if (tag.IsCData) {
// TODO: Be strict only if the opening bracket is complete
tag.AddChildren(ReadTextUntil("]]>").ToList());
tag.AddChildren(ReadText(RawTextType.CData));
} else if (tag.IsDocumentType) {
// TODO: Nested definition
tag.AddChildren(ReadTextUntil(">").ToList());
tag.AddChildren(ReadText(RawTextType.DocumentTypeDefinition));
}
string bracket;
if (TryReadClosingBracket(out bracket)) {
@ -448,8 +476,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -448,8 +476,7 @@ namespace ICSharpCode.AvalonEdit.XmlParser
}
tag.EndOffset = currentLocation;
LogParsed(tag);
parsedItems.Add(tag);
OnParsed(tag);
return tag;
}
@ -525,19 +552,16 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -525,19 +552,16 @@ namespace ICSharpCode.AvalonEdit.XmlParser
{
AssertHasMoreData();
RawAttribute attr = ReadFromCache<RawAttribute>(currentLocation);
if (attr != null) return attr;
attr = new RawAttribute();
RawAttribute attr;
if (TryReadFromCacheOrNew(out attr)) return attr;
attr.StartOffset = currentLocation;
if (HasMoreData()) attr.Name = ReadName();
int checkpoint = currentLocation;
attr.EqualsSign = string.Empty;
if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;
TryMoveToNonWhiteSpace();
if (TryRead('=')) {
attr.EqualsSign += "=";
if (TryPeekWhiteSpace()) attr.EqualsSign += ReadWhiteSpace().Value;
TryMoveToNonWhiteSpace();
attr.EqualsSign += GetText(checkpoint, currentLocation);
// Read attribute value
int start = currentLocation;
if (TryRead('"')) {
@ -550,81 +574,104 @@ namespace ICSharpCode.AvalonEdit.XmlParser @@ -550,81 +574,104 @@ namespace ICSharpCode.AvalonEdit.XmlParser
attr.Value = GetText(start, currentLocation);
}
} else {
attr.EqualsSign = null;
currentLocation = checkpoint;
}
attr.EndOffset = currentLocation;
parsedItems.Add(attr);
OnParsed(attr);
return attr;
}
/// <summary>
/// Reads a run of whitespace starting at the current location.
/// A cached object is returned if one is available; otherwise a new
/// RawText is created, filled in and added to parsedItems.
/// </summary>
RawText ReadWhiteSpace()
{
    AssertHasMoreData();

    RawText whiteSpace = ReadFromCache<RawText>(currentLocation);
    if (whiteSpace == null) {
        whiteSpace = new RawText();
        int begin = currentLocation;
        whiteSpace.StartOffset = begin;

        // Consume all consecutive whitespace characters
        while (TryPeekWhiteSpace()) {
            TryMoveNext();
        }

        whiteSpace.Value = GetText(begin, currentLocation);
        whiteSpace.EndOffset = currentLocation;
        Debug.Assert(whiteSpace.Value.Length > 0);
        parsedItems.Add(whiteSpace);
    }
    return whiteSpace;
}
/// <summary>
/// Reads character data from the current location up to the next '&lt;'
/// (or to the end of the data). A cached object is returned if one is
/// available; otherwise a new RawText is created, filled in and added
/// to parsedItems.
/// </summary>
RawText ReadCharacterData()
{
    Debug.Assert(HasMoreData());

    RawText fragment = ReadFromCache<RawText>(currentLocation);
    if (fragment == null) {
        fragment = new RawText();
        int begin = currentLocation;
        fragment.StartOffset = begin;

        // The result is deliberately ignored: running into the end of the data is fine
        TryMoveTo('<');

        fragment.Value = GetText(begin, currentLocation);
        fragment.EndOffset = currentLocation;
        Debug.Assert(fragment.Value.Length > 0);
        parsedItems.Add(fragment);
    }
    return fragment;
}
// Tuning constants of the fragmented text reading (see ReadText)

// How far back before the backtrack point ReadText searches for the '&' of an entity reference
const int maxEntityLenght = 12; // 6 for built-in ones
// Number of characters ReadText reads at most before it cuts the text into another fragment
const int maxTextFragmentSize = 8;
// How far ahead ReadText looks for an already cached text fragment
const int lookAheadLenght = (3 * maxTextFragmentSize) / 2;
// Number of characters ReadText steps back when a fragment was cut by the size limit
const int backtrackLenght = 4; // 2: get back over "]]" 1: so that we have some data 1: safety
/// <summary>
/// Reads text of the given type and optionally separates it into fragments.
/// It can also return an empty set if there is no appropriate text at the current location.
/// </summary>
/// <param name="type">Kind of text to read; it selects the terminator which ends the text</param>
/// <returns>
/// Lazily produced text fragments. Each fragment is either taken from the cache
/// or newly parsed (and then registered using OnParsed).
/// </returns>
IEnumerable<RawObject> ReadText(RawTextType type)
{
    bool lookahead = false;
    while(true) {
        RawText text;
        if (TryReadFromCacheOrNew(t => t.Type == type, out text)) {
            // Cached text found
            yield return text;
            lookahead = true; // In the middle of the text edit
            continue; // Read next fragment; the method can handle "no text left"
        }
        text.Type = type;

        // Limit the reading to just a few characters
        int realReadingEnd = readingEnd;
        readingEnd = Math.Min(realReadingEnd, currentLocation + maxTextFragmentSize);

        // Look if some further text has been already processed and align so that
        // we hit that cache point. It is expensive so it is off for the first run
        if (lookahead) {
            int nextFragmentIndex = GetStartOfCachedObject<RawText>(t => t.Type == type, currentLocation, lookAheadLenght);
            if (nextFragmentIndex != -1) {
                // Consider adding "aaa]" before cached fragment "]>bbb"
                // We must not use cache then - so the overshoot actually makes sense.
                // The overshoot must not run past the real end of the data though,
                // otherwise the reading methods would index outside of the input.
                readingEnd = Math.Min(realReadingEnd, nextFragmentIndex + backtrackLenght);
                Log("Parsing only text ({0}-{1}) because later text was already processed", currentLocation, readingEnd);
            }
        }
        lookahead = true;

        text.StartOffset = currentLocation;
        int start = currentLocation;

        // Try move to the terminator given by the context
        switch (type) {
            case RawTextType.WhiteSpace:
                TryMoveToNonWhiteSpace();
                break;
            case RawTextType.CharacterData:
                TryMoveTo('<');
                break;
            case RawTextType.Comment:
                TryMoveTo("--");
                break;
            case RawTextType.DocumentTypeDefinition:
                TryMoveTo('>');
                break;
            case RawTextType.CData:
                TryMoveTo("]]>");
                break;
            default:
                // Do not leave the temporary reading limit behind
                readingEnd = realReadingEnd;
                throw new Exception("Uknown type " + type);
        }

        // Terminal found or real end was reached;
        bool finished = currentLocation < readingEnd || currentLocation == realReadingEnd;

        // Finished reading - restore the old reading end
        readingEnd = realReadingEnd;

        if (!finished) {
            // We have to continue reading more text fragments

            // We have to backtrack a bit because we just might ended with "]]" and the ">" was cut
            int backtrack = currentLocation - backtrackLenght;

            // If there is entity reference, make sure the next segment starts with it to prevent fragmentation
            int entitySearchStart = Math.Max(start + 1 /* data for us */, backtrack - maxEntityLenght);
            int entitySearchLength = backtrack - entitySearchStart;
            if (entitySearchLength > 0) {
                // LastIndexOf scans backwards, so it has to be given the last index of
                // the searched range; the range is [entitySearchStart, backtrack - 1]
                int entityIndex = input.LastIndexOf('&', backtrack - 1, entitySearchLength);
                if (entityIndex != -1) {
                    backtrack = entityIndex;
                }
            }

            currentLocation = Math.Max(start + 1, backtrack); // Max-just in case
        }

        text.Value = GetText(start, currentLocation);
        text.EndOffset = currentLocation;

        // Empty fragments are never reported
        if (text.Value.Length > 0) {
            OnParsed(text);
            yield return text;
        }

        if (finished) {
            yield break;
        }
    }
}
}
}

Loading…
Cancel
Save