mirror of https://github.com/icsharpcode/ILSpy.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
208 lines
6.7 KiB
208 lines
6.7 KiB
// Copyright (c) AlphaSierraPapa for the SharpDevelop Team (for details please see \doc\copyright.txt)
// This code is distributed under the GNU LGPL (for details please see \doc\license.txt)
|
|
|
using System; |
|
using System.IO; |
|
using System.Text; |
|
|
|
namespace ICSharpCode.AvalonEdit.Utils |
|
{ |
|
/// <summary>
/// Class that can open text files with auto-detection of the encoding.
/// </summary>
/// <remarks>
/// Detection relies on byte order marks where present; otherwise the first
/// 500 KB of the stream are checked for being well-formed UTF-8.
/// </remarks>
public static class FileReader
{
	/// <summary>
	/// Gets if the given encoding is a Unicode encoding (UTF).
	/// </summary>
	/// <remarks>
	/// Returns true for UTF-7, UTF-8, UTF-16 LE, UTF-16 BE, UTF-32 LE and UTF-32 BE.
	/// Returns false for all other encodings.
	/// </remarks>
	/// <param name="encoding">The encoding to classify.</param>
	/// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
	public static bool IsUnicode(Encoding encoding)
	{
		if (encoding == null)
			throw new ArgumentNullException("encoding");
		switch (encoding.CodePage) {
			case 65000: // UTF-7
			case 65001: // UTF-8
			case 1200: // UTF-16 LE
			case 1201: // UTF-16 BE
			case 12000: // UTF-32 LE
			case 12001: // UTF-32 BE
				return true;
			default:
				return false;
		}
	}

	/// <summary>
	/// Reads the content of the given stream.
	/// </summary>
	/// <param name="stream">The stream to read.
	/// The stream must support seeking and must be positioned at its beginning.</param>
	/// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
	/// <returns>The file content as string.</returns>
	/// <remarks>
	/// The stream is closed when this method returns, because the
	/// <see cref="StreamReader"/> wrapping it is disposed.
	/// </remarks>
	public static string ReadFileContent(Stream stream, Encoding defaultEncoding)
	{
		// argument validation is done by OpenStream
		using (StreamReader reader = OpenStream(stream, defaultEncoding)) {
			return reader.ReadToEnd();
		}
	}

	/// <summary>
	/// Reads the content of the file.
	/// </summary>
	/// <param name="fileName">The file name.</param>
	/// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
	/// <returns>The file content as string.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="fileName"/> or
	/// <paramref name="defaultEncoding"/> is null.</exception>
	public static string ReadFileContent(string fileName, Encoding defaultEncoding)
	{
		// same guard as OpenFile, so the exception names this method's own parameter
		if (fileName == null)
			throw new ArgumentNullException("fileName");
		using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)) {
			return ReadFileContent(fs, defaultEncoding);
		}
	}

	/// <summary>
	/// Opens the specified file for reading.
	/// </summary>
	/// <param name="fileName">The file to open.</param>
	/// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
	/// <returns>Returns a StreamReader that reads from the stream. Use
	/// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="fileName"/> or
	/// <paramref name="defaultEncoding"/> is null.</exception>
	public static StreamReader OpenFile(string fileName, Encoding defaultEncoding)
	{
		if (fileName == null)
			throw new ArgumentNullException("fileName");
		FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
		try {
			return OpenStream(fs, defaultEncoding);
			// don't use finally: the stream must be kept open until the StreamReader closes it
		} catch {
			// only on failure do we own the stream; otherwise the returned reader does
			fs.Dispose();
			throw;
		}
	}

	/// <summary>
	/// Opens the specified stream for reading.
	/// </summary>
	/// <param name="stream">The stream to open.
	/// It must support seeking and must be positioned at its beginning.</param>
	/// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
	/// <returns>Returns a StreamReader that reads from the stream. Use
	/// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="stream"/> or
	/// <paramref name="defaultEncoding"/> is null.</exception>
	/// <exception cref="ArgumentException"><paramref name="stream"/> is not positioned at its beginning.</exception>
	public static StreamReader OpenStream(Stream stream, Encoding defaultEncoding)
	{
		if (stream == null)
			throw new ArgumentNullException("stream");
		if (stream.Position != 0)
			throw new ArgumentException("stream is not positioned at beginning.", "stream");
		if (defaultEncoding == null)
			throw new ArgumentNullException("defaultEncoding");

		if (stream.Length < 2) {
			// too short to inspect two bytes; defaultEncoding is known to be non-null here
			return new StreamReader(stream, defaultEncoding);
		}

		// the autodetection of StreamReader is not capable of detecting the difference
		// between ISO-8859-1 and UTF-8 without BOM.
		int firstByte = stream.ReadByte();
		int secondByte = stream.ReadByte();
		switch ((firstByte << 8) | secondByte) {
			case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
			case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
			case 0xfeff: // UTF-16 BE BOM
			case 0xefbb: // start of UTF-8 BOM
				// StreamReader autodetection works
				stream.Position = 0;
				return new StreamReader(stream);
			default:
				return AutoDetect(stream, (byte)firstByte, (byte)secondByte, defaultEncoding);
		}
	}

	/// <summary>
	/// Decides between UTF-8 and <paramref name="defaultEncoding"/> for a stream without
	/// a byte order mark by checking whether its first bytes form well-formed UTF-8.
	/// </summary>
	/// <param name="fs">The stream; its first two bytes have already been consumed.
	/// It is rewound to position 0 before the reader is created.</param>
	/// <param name="firstByte">The first byte of the stream.</param>
	/// <param name="secondByte">The second byte of the stream.</param>
	/// <param name="defaultEncoding">The encoding to use if the content is pure ASCII or not valid UTF-8.</param>
	/// <returns>A StreamReader positioned at the beginning of <paramref name="fs"/>.</returns>
	static StreamReader AutoDetect(Stream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
	{
		long length = fs.Length;
		int max = (int)Math.Min(length, 500000); // look at max. 500 KB
		// true when the scan covers all data, i.e. it is not cut off by the 500 KB limit
		bool scannedToEnd = max == length;
		const int ASCII = 0;
		const int Error = 1;
		const int UTF8 = 2;
		const int UTF8Sequence = 3;
		int state = ASCII;
		int sequenceLength = 0;
		for (int i = 0; i < max; i++) {
			byte b;
			if (i == 0) {
				b = firstByte;
			} else if (i == 1) {
				b = secondByte;
			} else {
				int next = fs.ReadByte();
				if (next < 0) {
					// the stream ended earlier than Length promised: stop scanning
					// instead of misreading the -1 end marker as a 0xFF data byte
					scannedToEnd = true;
					break;
				}
				b = (byte)next;
			}
			if (b < 0x80) {
				// normal ASCII character
				if (state == UTF8Sequence) {
					state = Error;
					break;
				}
			} else if (b < 0xc0) {
				// 10xxxxxx : continues UTF8 byte sequence
				if (state == UTF8Sequence) {
					--sequenceLength;
					if (sequenceLength < 0) {
						state = Error;
						break;
					} else if (sequenceLength == 0) {
						state = UTF8;
					}
				} else {
					state = Error;
					break;
				}
			} else if (b >= 0xc2 && b < 0xf5) {
				// beginning of byte sequence
				if (state == UTF8 || state == ASCII) {
					state = UTF8Sequence;
					if (b < 0xe0) {
						sequenceLength = 1; // one more byte following
					} else if (b < 0xf0) {
						sequenceLength = 2; // two more bytes following
					} else {
						sequenceLength = 3; // three more bytes following
					}
				} else {
					state = Error;
					break;
				}
			} else {
				// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
				state = Error;
				break;
			}
		}
		if (state == UTF8Sequence && scannedToEnd) {
			// The data ends in the middle of a multi-byte sequence, so it is not valid UTF-8
			// (e.g. an ISO-8859-1 file whose last character is an accented letter).
			// When the scan was merely cut off at the 500 KB limit, an unfinished
			// sequence is expected and the content is still treated as UTF-8.
			state = Error;
		}
		fs.Position = 0;
		switch (state) {
			case ASCII:
			case Error:
				// when the file seems to be ASCII or non-UTF8,
				// we read it using the user-specified encoding so it is saved again
				// using that encoding.
				if (IsUnicode(defaultEncoding)) {
					// the file is not Unicode, so don't read it using Unicode even if the
					// user has chosen Unicode as the default encoding.

					// If we don't do this, SD will end up always adding a Byte Order Mark
					// to ASCII files.

					// NOTE(review): on .NET Core / .NET 5+ Encoding.Default is documented to be
					// UTF-8, so this fallback is only non-Unicode on .NET Framework - confirm
					// against the target framework.
					defaultEncoding = Encoding.Default; // use system encoding instead
				}
				return new StreamReader(fs, defaultEncoding);
			default:
				return new StreamReader(fs);
		}
	}
}
|
}
|
|
|