Browse Source

fix MimeTypeDetection again

pull/6/merge
Siegfried Pammer 14 years ago
parent
commit
abce2ee65f
  1. 157
      src/Main/Base/Project/Src/Services/MimeTypeDetection.cs
  2. 4
      src/Main/Base/Test/ICSharpCode.SharpDevelop.Tests.csproj
  3. 47
      src/Main/Base/Test/MimeDetectionTests.cs
  4. BIN
      src/Main/Base/Test/mime_utf-16_be_test.txt
  5. BIN
      src/Main/Base/Test/mime_utf-16_le_test.txt

157
src/Main/Base/Project/Src/Services/MimeTypeDetection.cs

@ -3,9 +3,9 @@
using System; using System;
using System.IO; using System.IO;
using System.Linq;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
using System.Text; using System.Text;
using System.Xml;
namespace ICSharpCode.SharpDevelop namespace ICSharpCode.SharpDevelop
{ {
@ -13,12 +13,9 @@ namespace ICSharpCode.SharpDevelop
{ {
const int BUFFER_SIZE = 4 * 1024; const int BUFFER_SIZE = 4 * 1024;
// Known BOMs public const string Binary = "application/octet-stream";
public static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF }; public const string Text = "text/plain";
public static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF }; public const string Xml = "text/xml";
public static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
public static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
public static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
[DllImport("urlmon.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = false)] [DllImport("urlmon.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = false)]
static extern unsafe int FindMimeFromData( static extern unsafe int FindMimeFromData(
@ -31,47 +28,115 @@ namespace ICSharpCode.SharpDevelop
out IntPtr ppwzMimeOut, out IntPtr ppwzMimeOut,
int dwReserved); int dwReserved);
public static string FindMimeType(Stream stream)
static byte[] DetectAndRemoveBOM(byte[] buffer, out int len)
{ {
len = UTF8.Length; StreamReader reader;
if (buffer.StartsWith(UTF8)) if (stream.Length >= 2) {
return buffer.Skip(UTF8.Length).ToArray(); int firstByte = stream.ReadByte();
len = UTF32BE.Length; int secondByte = stream.ReadByte();
if (buffer.StartsWith(UTF32BE)) switch ((firstByte << 8) | secondByte) {
return buffer.Skip(UTF32BE.Length).ToArray(); case 0xfffe: // UTF-16 LE BOM / UTF-32 LE BOM
len = UTF32LE.Length; case 0xfeff: // UTF-16 BE BOM
if (buffer.StartsWith(UTF32LE)) stream.Position -= 2;
return buffer.Skip(UTF32LE.Length).ToArray(); reader = new StreamReader(stream, detectEncodingFromByteOrderMarks: true);
len = UTF16LE.Length; break;
if (buffer.StartsWith(UTF16LE)) case 0xefbb: // start of UTF-8 BOM
return buffer.Skip(UTF16LE.Length).ToArray(); if (stream.ReadByte() == 0xbf) {
len = UTF16BE.Length; reader = new StreamReader(stream, Encoding.UTF8);
if (buffer.StartsWith(UTF16BE)) break;
return buffer.Skip(UTF16BE.Length).ToArray(); } else {
len = 0; return Binary;
return buffer; }
default:
if (IsUTF8(stream, (byte)firstByte, (byte)secondByte)) {
stream.Position = 0;
reader = new StreamReader(stream, Encoding.UTF8);
break;
} else {
byte[] buffer = new byte[BUFFER_SIZE];
int length = stream.Read(buffer, 0, BUFFER_SIZE);
return FindMimeType(buffer, 0, length);
}
}
} else {
return Text;
}
// Now we got a StreamReader with the correct encoding
// Check for XML now
try {
XmlTextReader xmlReader = new XmlTextReader(reader);
xmlReader.XmlResolver = null;
xmlReader.MoveToContent();
return Xml;
} catch (XmlException) {
return Text;
}
} }
static bool StartsWith(this byte[] buffer, byte[] start) static bool IsUTF8(Stream fs, byte firstByte, byte secondByte)
{ {
if (buffer.Length < start.Length) int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
return false; const int ASCII = 0;
int i = 0; const int Error = 1;
while (i < start.Length && buffer[i] == start[i]) const int UTF8 = 2;
i++; const int UTF8Sequence = 3;
return i >= start.Length; int state = ASCII;
int sequenceLength = 0;
byte b;
for (int i = 0; i < max; i++) {
if (i == 0) {
b = firstByte;
} else if (i == 1) {
b = secondByte;
} else {
b = (byte)fs.ReadByte();
}
if (b < 0x80) {
// normal ASCII character
if (state == UTF8Sequence) {
state = Error;
break;
}
} else if (b < 0xc0) {
// 10xxxxxx : continues UTF8 byte sequence
if (state == UTF8Sequence) {
--sequenceLength;
if (sequenceLength < 0) {
state = Error;
break;
} else if (sequenceLength == 0) {
state = UTF8;
}
} else {
state = Error;
break;
}
} else if (b >= 0xc2 && b < 0xf5) {
// beginning of byte sequence
if (state == UTF8 || state == ASCII) {
state = UTF8Sequence;
if (b < 0xe0) {
sequenceLength = 1; // one more byte following
} else if (b < 0xf0) {
sequenceLength = 2; // two more bytes following
} else {
sequenceLength = 3; // three more bytes following
}
} else {
state = Error;
break;
}
} else {
// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
state = Error;
break;
}
}
return state != Error;
} }
static unsafe string FindMimeType(byte[] buffer, int offset, int length) static unsafe string FindMimeType(byte[] buffer, int offset, int length)
{ {
int len;
buffer = DetectAndRemoveBOM(buffer, out len);
length -= len;
offset = (offset < len) ? 0 : offset - len;
if (length == 0)
return "text/plain";
fixed (byte *b = &buffer[offset]) { fixed (byte *b = &buffer[offset]) {
const int FMFD_ENABLEMIMESNIFFING = 0x00000002; const int FMFD_ENABLEMIMESNIFFING = 0x00000002;
IntPtr mimeout; IntPtr mimeout;
@ -89,16 +154,8 @@ namespace ICSharpCode.SharpDevelop
{ {
if (buffer == null) if (buffer == null)
throw new ArgumentNullException("buffer"); throw new ArgumentNullException("buffer");
return FindMimeType(buffer, 0, buffer.Length); using (MemoryStream stream = new MemoryStream(buffer))
} return FindMimeType(stream);
public static string FindMimeType(Stream stream)
{
if (stream == null)
throw new ArgumentNullException("stream");
byte[] buffer = new byte[BUFFER_SIZE];
stream.Position = 0;
return FindMimeType(buffer, 0, stream.Read(buffer, 0, buffer.Length));
} }
} }
} }

4
src/Main/Base/Test/ICSharpCode.SharpDevelop.Tests.csproj

@ -195,5 +195,9 @@
<Name>ICSharpCode.SharpDevelop.Dom</Name> <Name>ICSharpCode.SharpDevelop.Dom</Name>
</ProjectReference> </ProjectReference>
</ItemGroup> </ItemGroup>
<ItemGroup>
<EmbeddedResource Include="mime_utf-16_be_test.txt" />
<EmbeddedResource Include="mime_utf-16_le_test.txt" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSHARP.Targets" /> <Import Project="$(MSBuildBinPath)\Microsoft.CSHARP.Targets" />
</Project> </Project>

47
src/Main/Base/Test/MimeDetectionTests.cs

@ -2,7 +2,9 @@
// This code is distributed under the GNU LGPL (for details please see \doc\license.txt) // This code is distributed under the GNU LGPL (for details please see \doc\license.txt)
using System; using System;
using System.IO;
using System.Linq; using System.Linq;
using System.Reflection;
using System.Text; using System.Text;
using NUnit.Framework; using NUnit.Framework;
@ -11,33 +13,56 @@ namespace ICSharpCode.SharpDevelop.Tests
[TestFixture] [TestFixture]
public class MimeTypeDetectionTests public class MimeTypeDetectionTests
{ {
// Known BOMs
static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF };
static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF };
static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
// static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
[Test] [Test]
public void TextPlain() public void TextPlain()
{ {
// always open empty files with text editor // always open empty files with text editor
TestMime(new byte[] {}, "text/plain"); TestMime(new byte[] {}, "text/plain");
// UTF-8 // UTF-8
TestMime(MimeTypeDetection.UTF8, "text/plain"); TestMime(UTF8, "text/plain");
// UTF-16 Big Endian // UTF-16 Big Endian
TestMime(MimeTypeDetection.UTF16BE, "text/plain"); TestMime(UTF16BE, "text/plain");
// UTF-16 Little Endian // UTF-16 Little Endian
TestMime(MimeTypeDetection.UTF16LE, "text/plain"); TestMime(UTF16LE, "text/plain");
// UTF-32 Big Endian // UTF-32 Big Endian
TestMime(MimeTypeDetection.UTF32BE, "text/plain"); // TestMime(UTF32BE, "text/plain");
// UTF-32 Little Endian // UTF-32 Little Endian
TestMime(MimeTypeDetection.UTF32LE, "text/plain"); TestMime(UTF32LE, "text/plain");
} }
[Test] [Test]
public void TextXml() public void TextXml()
{ {
string xml = "<?xml version=\"1.0\" ?>"; string xml = "<?xml version=\"1.0\" ?><My File='Test' />";
TestMime(Encoding.Default.GetBytes(xml), "text/xml"); TestMime(Encoding.Default.GetBytes(xml), "text/xml");
TestMime(MimeTypeDetection.UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml"); TestMime(UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF16BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml"); TestMime(UTF16BE.Concat(Encoding.BigEndianUnicode.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF16LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml"); TestMime(UTF16LE.Concat(Encoding.Unicode.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF32BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml"); // TestMime(UTF32BE.Concat(new UTF32Encoding(true, true).GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF32LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml"); TestMime(UTF32LE.Concat(Encoding.UTF32.GetBytes(xml)).ToArray(), "text/xml");
}
[Test]
public void TestFiles()
{
TestMime(LoadFile("ICSharpCode.SharpDevelop.Tests.mime_utf-16_be_test.txt"), "text/plain");
TestMime(LoadFile("ICSharpCode.SharpDevelop.Tests.mime_utf-16_le_test.txt"), "text/plain");
}
byte[] LoadFile(string resourceName)
{
using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)) {
byte[] bytes = new byte[stream.Length];
stream.Read(bytes, 0, bytes.Length);
return bytes;
}
} }
void TestMime(byte[] bytes, string expectedMime) void TestMime(byte[] bytes, string expectedMime)

BIN
src/Main/Base/Test/mime_utf-16_be_test.txt

Binary file not shown.

BIN
src/Main/Base/Test/mime_utf-16_le_test.txt

Binary file not shown.
Loading…
Cancel
Save