fix MimeTypeDetection again

14 years ago · abce2ee65f
5 changed files with 147 additions and 61 deletions
--- a/src/Main/Base/Project/Src/Services/MimeTypeDetection.cs
+++ b/src/Main/Base/Project/Src/Services/MimeTypeDetection.cs
@ -3,9 +3,9 @@
 using System;
 using System.IO;
 using System.Linq;
 using System.Runtime.InteropServices;
 using System.Text;
 using System.Xml;
 namespace ICSharpCode.SharpDevelop
 {
@ -13,12 +13,9 @@ namespace ICSharpCode.SharpDevelop
 	{
 		const int BUFFER_SIZE = 4 * 1024;
-		// Known BOMs
+		public const string Binary = "application/octet-stream";
-		public static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF };
+		public const string Text = "text/plain";
-		public static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF };
+		public const string Xml = "text/xml";
 		public static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
 		public static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
 		public static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
 		[DllImport("urlmon.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = false)]
 		static extern unsafe int FindMimeFromData(
@ -31,47 +28,115 @@ namespace ICSharpCode.SharpDevelop
 			out IntPtr ppwzMimeOut,
 			int dwReserved);
-		
 		// Determines a MIME type for the stream content. The added (+) lines inspect the
 		// first bytes for a byte order mark, pick a StreamReader accordingly and finally
 		// probe for XML; the result is Binary, Text, Xml or whatever
 		// FindMimeType(byte[], int, int) reports for content that is not valid UTF-8.
+		public static string FindMimeType(Stream stream)
 		static byte[] DetectAndRemoveBOM(byte[] buffer, out int len)
 		{
-			len = UTF8.Length;
+			StreamReader reader;
-			if (buffer.StartsWith(UTF8))
+			if (stream.Length >= 2) {
-				return buffer.Skip(UTF8.Length).ToArray();
+				int firstByte = stream.ReadByte();
-			len = UTF32BE.Length;
+				int secondByte = stream.ReadByte();
-			if (buffer.StartsWith(UTF32BE))
+				switch ((firstByte << 8) | secondByte) {
-				return buffer.Skip(UTF32BE.Length).ToArray();
+					case 0xfffe: // UTF-16 LE BOM / UTF-32 LE BOM
-			len = UTF32LE.Length;
+					case 0xfeff: // UTF-16 BE BOM
-			if (buffer.StartsWith(UTF32LE))
+						stream.Position -= 2;
-				return buffer.Skip(UTF32LE.Length).ToArray();
+						reader = new StreamReader(stream, detectEncodingFromByteOrderMarks: true);
-			len = UTF16LE.Length;
+						break;
-			if (buffer.StartsWith(UTF16LE))
+					case 0xefbb: // start of UTF-8 BOM
-				return buffer.Skip(UTF16LE.Length).ToArray();
+						if (stream.ReadByte() == 0xbf) {
-			len = UTF16BE.Length;
+							reader = new StreamReader(stream, Encoding.UTF8);
-			if (buffer.StartsWith(UTF16BE))
+							break;
-				return buffer.Skip(UTF16BE.Length).ToArray();
+						} else {
-			len = 0;
+							return Binary;
-			return buffer;
+						}
 					default:
 						// No BOM: IsUTF8 validates the content, continuing to read from the stream
 						// after the two bytes that were already consumed above.
 						if (IsUTF8(stream, (byte)firstByte, (byte)secondByte)) {
 							// Rewind so the reader sees the content from the first byte.
 							stream.Position = 0;
 							reader = new StreamReader(stream, Encoding.UTF8);
 							break;
 						} else {
 							// NOTE(review): the stream is not rewound here, so the buffer is filled
 							// from wherever IsUTF8 stopped reading rather than from the start of the
 							// stream - confirm this is intended.
 							byte[] buffer = new byte[BUFFER_SIZE];
 							int length = stream.Read(buffer, 0, BUFFER_SIZE);
 							return FindMimeType(buffer, 0, length);
 						}
 				}
 			} else {
 				// Streams shorter than two bytes (including empty ones) are reported as text.
 				return Text;
 			}
 			// Now we got a StreamReader with the correct encoding
 			// Check for XML now
 			try {
 				XmlTextReader xmlReader = new XmlTextReader(reader);
 				// No resolver is set, so external references are not resolved while probing.
 				xmlReader.XmlResolver = null;
 				// MoveToContent throws XmlException when the text is not parseable as XML.
 				xmlReader.MoveToContent();
 				return Xml;
 			} catch (XmlException) {
 				return Text;
 			}
 		}
-		static bool StartsWith(this byte[] buffer, byte[] start)
 		// Checks whether the leading bytes of the stream form valid UTF-8 (pure ASCII
 		// included). firstByte and secondByte are the two bytes the caller has already
 		// read; all further bytes are read from fs with ReadByte, advancing its position.
 		// Returns false as soon as an invalid byte or sequence is found, true otherwise.
+		static bool IsUTF8(Stream fs, byte firstByte, byte secondByte)
 		{
-			if (buffer.Length < start.Length)
+			int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
-				return false;
+			const int ASCII = 0;
-			int i = 0;
+			const int Error = 1;
-			while (i < start.Length && buffer[i] == start[i])
+			const int UTF8  = 2;
-				i++;
+			const int UTF8Sequence = 3;
-			return i >= start.Length;
+			int state = ASCII;
 			// Number of continuation bytes still expected for the current multi-byte sequence.
 			int sequenceLength = 0;
 			byte b;
 			for (int i = 0; i < max; i++) {
 				// The first two bytes come from the parameters; the rest from the stream.
 				if (i == 0) {
 					b = firstByte;
 				} else if (i == 1) {
 					b = secondByte;
 				} else {
 					b = (byte)fs.ReadByte();
 				}
 				if (b < 0x80) {
 					// normal ASCII character
 					// An ASCII byte in the middle of a multi-byte sequence is invalid.
 					if (state == UTF8Sequence) {
 						state = Error;
 						break;
 					}
 				} else if (b < 0xc0) {
 					// 10xxxxxx : continues UTF8 byte sequence
 					if (state == UTF8Sequence) {
 						--sequenceLength;
 						if (sequenceLength < 0) {
 							state = Error;
 							break;
 						} else if (sequenceLength == 0) {
 							// Sequence complete: at least one non-ASCII character has been seen.
 							state = UTF8;
 						}
 					} else {
 						// Continuation byte without a preceding lead byte.
 						state = Error;
 						break;
 					}
 				} else if (b >= 0xc2 && b < 0xf5) {
 					// beginning of byte sequence
 					if (state == UTF8 || state == ASCII) {
 						state = UTF8Sequence;
 						if (b < 0xe0) {
 							sequenceLength = 1; // one more byte following
 						} else if (b < 0xf0) {
 							sequenceLength = 2; // two more bytes following
 						} else {
 							sequenceLength = 3; // three more bytes following
 						}
 					} else {
 						// A new lead byte while the previous sequence is still incomplete.
 						state = Error;
 						break;
 					}
 				} else {
 					// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
 					state = Error;
 					break;
 				}
 			}
 			// NOTE(review): a multi-byte sequence that is still open when the scan ends leaves
 			// state at UTF8Sequence, which is reported as valid here.
 			return state != Error;
 		}
 		static unsafe string FindMimeType(byte[] buffer, int offset, int length)
 		{
 			int len;
 			buffer = DetectAndRemoveBOM(buffer, out len);
 			length -= len;
 			offset = (offset < len) ? 0 : offset - len;
 			if (length == 0)
 				return "text/plain";
 			fixed (byte *b = &buffer[offset]) {
 				const int FMFD_ENABLEMIMESNIFFING = 0x00000002;
 				IntPtr mimeout;
@ -89,16 +154,8 @@ namespace ICSharpCode.SharpDevelop
 		{
 			if (buffer == null)
 				throw new ArgumentNullException("buffer");
-			return FindMimeType(buffer, 0, buffer.Length);
+			using (MemoryStream stream = new MemoryStream(buffer))
-		}
+				return FindMimeType(stream);
 		/// <summary>
 		/// Detects the MIME type from the bytes at the start of the stream.
 		/// </summary>
 		/// <param name="stream">Stream to inspect; its position is reset to 0 before reading.</param>
 		/// <returns>The result of FindMimeType(byte[], int, int) for the bytes that were read.</returns>
 		/// <exception cref="ArgumentNullException">stream is null.</exception>
 		public static string FindMimeType(Stream stream)
 		{
 			if (stream == null) {
 				throw new ArgumentNullException("stream");
 			}
 			// Sniff from the beginning of the stream, regardless of its current position;
 			// a single Read call fetches at most BUFFER_SIZE bytes.
 			byte[] header = new byte[BUFFER_SIZE];
 			stream.Position = 0;
 			int bytesRead = stream.Read(header, 0, header.Length);
 			return FindMimeType(header, 0, bytesRead);
 		}
 	}
 }
--- a/src/Main/Base/Test/ICSharpCode.SharpDevelop.Tests.csproj
+++ b/src/Main/Base/Test/ICSharpCode.SharpDevelop.Tests.csproj
@ -195,5 +195,9 @@
      <Name>ICSharpCode.SharpDevelop.Dom</Name>
    </ProjectReference>
  </ItemGroup>
  <ItemGroup>
    <EmbeddedResource Include="mime_utf-16_be_test.txt" />
    <EmbeddedResource Include="mime_utf-16_le_test.txt" />
  </ItemGroup>
  <Import Project="$(MSBuildBinPath)\Microsoft.CSHARP.Targets" />
 </Project>
--- a/src/Main/Base/Test/MimeDetectionTests.cs
+++ b/src/Main/Base/Test/MimeDetectionTests.cs
@ -2,7 +2,9 @@
 // This code is distributed under the GNU LGPL (for details please see \doc\license.txt)
 using System;
 using System.IO;
 using System.Linq;
 using System.Reflection;
 using System.Text;
 using NUnit.Framework;
@ -11,33 +13,56 @@ namespace ICSharpCode.SharpDevelop.Tests
 	[TestFixture]
 	public class MimeTypeDetectionTests
 	{
 		// Known BOMs
 		static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF };
 		static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF };
 		static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
 //		static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
 		static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
 		// Passes an empty array and several bare byte order marks to TestMime, each with
 		// "text/plain" as the expected MIME type. The UTF-32 big endian case is commented out.
 		[Test]
 		public void TextPlain()
 		{
 			// always open empty files with text editor
 			TestMime(new byte[] {}, "text/plain");
 			// UTF-8
-			TestMime(MimeTypeDetection.UTF8, "text/plain");
+			TestMime(UTF8, "text/plain");
 			// UTF-16 Big Endian
-			TestMime(MimeTypeDetection.UTF16BE, "text/plain");
+			TestMime(UTF16BE, "text/plain");
 			// UTF-16 Little Endian
-			TestMime(MimeTypeDetection.UTF16LE, "text/plain");
+			TestMime(UTF16LE, "text/plain");
 			// UTF-32 Big Endian
-			TestMime(MimeTypeDetection.UTF32BE, "text/plain");
+//			TestMime(UTF32BE, "text/plain");
 			// UTF-32 Little Endian
-			TestMime(MimeTypeDetection.UTF32LE, "text/plain");
+			TestMime(UTF32LE, "text/plain");
 		}
 		// Passes a small XML document to TestMime, once without a byte order mark and then
 		// prefixed with each BOM, always expecting "text/xml". The added (+) lines encode
 		// the document with the encoding that matches the BOM; the UTF-32 big endian case
 		// is commented out.
 		[Test]
 		public void TextXml()
 		{
-			string xml = "<?xml version=\"1.0\" ?>";
+			string xml = "<?xml version=\"1.0\" ?><My File='Test' />";
 			// No BOM: bytes produced by the system default encoding.
 			TestMime(Encoding.Default.GetBytes(xml), "text/xml");
-			TestMime(MimeTypeDetection.UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
+			TestMime(UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
-			TestMime(MimeTypeDetection.UTF16BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
+			TestMime(UTF16BE.Concat(Encoding.BigEndianUnicode.GetBytes(xml)).ToArray(), "text/xml");
-			TestMime(MimeTypeDetection.UTF16LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
+			TestMime(UTF16LE.Concat(Encoding.Unicode.GetBytes(xml)).ToArray(), "text/xml");
-			TestMime(MimeTypeDetection.UTF32BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
+//			TestMime(UTF32BE.Concat(new UTF32Encoding(true, true).GetBytes(xml)).ToArray(), "text/xml");
-			TestMime(MimeTypeDetection.UTF32LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
+			TestMime(UTF32LE.Concat(Encoding.UTF32.GetBytes(xml)).ToArray(), "text/xml");
 		}
 		/// <summary>
 		/// Loads the embedded UTF-16 sample files and passes each one to TestMime with
 		/// "text/plain" as the expected MIME type.
 		/// </summary>
 		[Test]
 		public void TestFiles()
 		{
 			// Big endian first, then little endian.
 			string[] resourceNames = {
 				"ICSharpCode.SharpDevelop.Tests.mime_utf-16_be_test.txt",
 				"ICSharpCode.SharpDevelop.Tests.mime_utf-16_le_test.txt"
 			};
 			foreach (string resourceName in resourceNames) {
 				TestMime(LoadFile(resourceName), "text/plain");
 			}
 		}
 		/// <summary>
 		/// Reads an embedded resource of the executing assembly completely into a byte array.
 		/// </summary>
 		/// <param name="resourceName">Full manifest name of the embedded resource.</param>
 		/// <returns>The complete content of the resource.</returns>
 		/// <exception cref="ArgumentException">No resource with that name is embedded.</exception>
 		/// <exception cref="EndOfStreamException">The stream ended before its reported length was read.</exception>
 		byte[] LoadFile(string resourceName)
 		{
 			using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)) {
 				// GetManifestResourceStream returns null for unknown names; fail with a
 				// message that names the resource instead of a NullReferenceException.
 				if (stream == null)
 					throw new ArgumentException("Embedded resource not found: " + resourceName, "resourceName");
 				byte[] bytes = new byte[stream.Length];
 				// Stream.Read may return fewer bytes than requested, so keep reading
 				// until the whole buffer is filled.
 				int offset = 0;
 				while (offset < bytes.Length) {
 					int read = stream.Read(bytes, offset, bytes.Length - offset);
 					if (read <= 0)
 						throw new EndOfStreamException("Unexpected end of embedded resource: " + resourceName);
 					offset += read;
 				}
 				return bytes;
 			}
 		}
 		void TestMime(byte[] bytes, string expectedMime)
--- a/src/Main/Base/Test/mime_utf-16_be_test.txt
+++ b/src/Main/Base/Test/mime_utf-16_be_test.txt
--- a/src/Main/Base/Test/mime_utf-16_le_test.txt
+++ b/src/Main/Base/Test/mime_utf-16_le_test.txt