From 751bd6e28eef0990478a67ed00ba291015f81177 Mon Sep 17 00:00:00 2001 From: Paddy Xu Date: Fri, 14 Jul 2017 18:49:55 +0300 Subject: [PATCH] Fix #39: detect file encoding --- .../QuickLook.Plugin.TextViewer.csproj | 4 + .../SimpleHelpers/FileEncoding.cs | 367 ++++++++++++++++++ .../TextViewerPanel.xaml | 3 +- .../TextViewerPanel.xaml.cs | 15 +- .../packages.config | 2 + 5 files changed, 389 insertions(+), 2 deletions(-) create mode 100644 QuickLook.Plugin/QuickLook.Plugin.TextViewer/SimpleHelpers/FileEncoding.cs diff --git a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/QuickLook.Plugin.TextViewer.csproj b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/QuickLook.Plugin.TextViewer.csproj index 4b791c5..19a887a 100644 --- a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/QuickLook.Plugin.TextViewer.csproj +++ b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/QuickLook.Plugin.TextViewer.csproj @@ -58,6 +58,9 @@ + + ..\..\packages\UDE.CSharp.1.1.0\lib\Ude.dll + @@ -66,6 +69,7 @@ + TextViewerPanel.xaml diff --git a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/SimpleHelpers/FileEncoding.cs b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/SimpleHelpers/FileEncoding.cs new file mode 100644 index 0000000..0111964 --- /dev/null +++ b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/SimpleHelpers/FileEncoding.cs @@ -0,0 +1,367 @@ +#region * License * +/* + SimpleHelpers - FileEncoding + + Copyright © 2014 Khalid Salomão + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the “Software”), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the 
Software.

    THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    OTHER DEALINGS IN THE SOFTWARE.

    License: http://www.opensource.org/licenses/mit-license.php
    Website: https://github.com/khalidsalomao/SimpleHelpers.Net
 */
#endregion

using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace QuickLook.Plugin.TextViewer.SimpleHelpers
{
public class FileEncoding
{
// Buffer size, in bytes, handed to the FileStream opened by the filename overload below.
const int DEFAULT_BUFFER_SIZE = 128 * 1024;

/// <summary>
/// Tries to detect the encoding of the file at the given path.
/// </summary>
/// <param name="inputFilename">The input filename.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/> when none was detected.</returns>
public static Encoding DetectFileEncoding (string inputFilename, Encoding defaultIfNotDetected = null)
{
    // Read-only access; other handles may keep reading, writing or deleting the file meanwhile.
    const FileShare sharing = FileShare.ReadWrite | FileShare.Delete;
    using (var input = new FileStream (inputFilename, FileMode.Open, FileAccess.Read, sharing, DEFAULT_BUFFER_SIZE))
    {
        // The stream overload applies the same fallback, so it can be forwarded directly.
        return DetectFileEncoding (input, defaultIfNotDetected);
    }
}

/// <summary>
/// Tries to detect the encoding of the data readable from a stream.
/// </summary>
/// <param name="inputStream">The input stream.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/> when none was detected.</returns>
public static Encoding DetectFileEncoding (Stream inputStream, Encoding defaultIfNotDetected = null)
{
    var detector = new FileEncoding ();
    detector.Detect (inputStream);

    Encoding detected = detector.Complete ();
    if (detected != null)
        return detected;
    return defaultIfNotDetected;
}

/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/> when none was detected.</returns>
public static Encoding DetectFileEncoding (byte[] inputData, int start, int count, Encoding defaultIfNotDetected = null)
{
    var det = new FileEncoding ();
    det.Detect (inputData, start, count);
    return det.Complete () ?? defaultIfNotDetected;
}

/// <summary>
/// Tries to load file content with the correct encoding.
/// </summary>
/// <param name="filename">The filename.</param>
/// <param name="defaultValue">The default value if unable to load file content.</param>
/// <returns>
/// File content, or <paramref name="defaultValue"/> when the file does not exist
/// or any step of detecting/reading it fails.
/// </returns>
public static string TryLoadFile (string filename, string defaultValue = "")
{
    try
    {
        if (File.Exists (filename))
        {
            // enable file encoding detection
            Encoding encoding = DetectFileEncoding (filename);

            // Detection returns null when no encoding could be determined, and
            // File.ReadAllText (path, encoding) rejects a null encoding. That exception used
            // to be swallowed by the catch below, so a perfectly readable file came back as
            // defaultValue. Let the framework pick its default decoding in that case.
            if (encoding == null)
                return File.ReadAllText (filename);

            // Load data based on the detected encoding
            return File.ReadAllText (filename, encoding);
        }
    }
    catch
    {
        // Deliberately best-effort ("Try..."): any failure falls through to defaultValue.
    }
    return defaultValue;
}

/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <returns>The verdict of the three-argument overload called with start 0 and count <c>rawData.Length</c>.</returns>
public static bool CheckForTextualData (byte[] rawData)
{
    return CheckForTextualData (rawData, 0, rawData.Length);
}

/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <returns>
/// <c>true</c> when the inspected bytes look like text, or when there is too little data
/// (fewer than 4 bytes, or a window that does not fit inside <c>rawData</c>) to judge;
/// <c>false</c> when the bytes look binary.
/// </returns>
/// <exception cref="ArgumentNullException"><c>rawData</c> is null.</exception>
/// <remarks>
/// The inspected window is the <c>count</c> bytes beginning at <c>start</c>.
/// </remarks>
public static bool CheckForTextualData (byte[] rawData, int start, int count)
{
    if (rawData == null)
        throw new ArgumentNullException ("rawData");

    // Too little data, or a window that does not fit the array: give the benefit of the doubt.
    // (Written as a subtraction so that start + count cannot overflow.)
    if (start < 0 || count < 4 || count > rawData.Length - start)
        return true;

    if (CheckForByteOrderMark (rawData, start))
    {
        return true;
    }

    // count is a length, so the scan stops at start + count. The previous code compared the
    // index against count itself, which inspected the wrong window whenever start was not 0.
    int end = start + count;

    // http://stackoverflow.com/questions/910873/how-can-i-determine-if-a-file-is-binary-or-text-in-c
    // http://www.gnu.org/software/diffutils/manual/html_node/Binary.html
    // count the number of null byte sequences
    // considering only sequences of 2 0s: "\0\0" or control characters below 10
    int nullSequences = 0;
    int controlSequences = 0;
    for (var i = start + 1; i < end; i++)
    {
        if (rawData[i - 1] == 0 && rawData[i] == 0)
        {
            // a single "\0\0" already decides the result, no need to keep counting
            if (++nullSequences > 1)
                break;
        }
        else if (rawData[i - 1] == 0 && rawData[i] < 10)
        {
            ++controlSequences;
        }
    }

    // is text if there are no null byte sequences and control characters make up at most
    // 10% of the analysed window (not of the whole array, which may be a larger, partly
    // filled read buffer)
    return nullSequences == 0 && controlSequences <= count / 10;
}

/// <summary>
/// Detects if data has bytes order mark to indicate its encoding for textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <returns><c>true</c> when the bytes at <c>start</c> begin with one of the recognised byte order marks.</returns>
private static bool CheckForByteOrderMark (byte[] rawData, int start = 0)
{
    if (rawData == null || start < 0)
        return false;

    // Each signature is checked against the bytes actually available, so a short buffer
    // that holds only a 2- or 3-byte mark is still recognised.
    // The little-endian marks (FF FE ...) were previously missing, although FF FE is what
    // "Unicode" text files written on Windows normally start with.
    return StartsWithSignature (rawData, start, 0xef, 0xbb, 0xbf)          // UTF-8
        || StartsWithSignature (rawData, start, 0xff, 0xfe, 0x00, 0x00)    // UTF-32, little endian
        || StartsWithSignature (rawData, start, 0x00, 0x00, 0xfe, 0xff)    // UTF-32, big endian
        || StartsWithSignature (rawData, start, 0xff, 0xfe)                // UTF-16, little endian
        || StartsWithSignature (rawData, start, 0xfe, 0xff)                // UTF-16, big endian
        || StartsWithSignature (rawData, start, 0x2b, 0x2f, 0x76);         // UTF-7
}

// True when data holds every byte of signature, in order, beginning at offset.
private static bool StartsWithSignature (byte[] data, int offset, params byte[] signature)
{
    if (data.Length - offset < signature.Length)
        return false;
    for (int i = 0; i < signature.Length; i++)
    {
        if (data[offset + i] != signature[i])
            return false;
    }
    return true;
}

// Detector that is fed every chunk passed to Detect; cleared by Reset.
readonly Ude.CharsetDetector ude = new Ude.CharsetDetector ();
// Set once the first chunk has been inspected for text/BOM; cleared by Reset.
bool _started = false;


/// <summary>
/// If the detection has reached a decision.
/// </summary>
/// <value>The done.</value>
public bool Done { get; set; }

/// <summary>
/// Detected encoding name.
/// </summary>
public string EncodingName { get; set; }

/// <summary>
/// If the data contains textual data.
/// </summary>
public bool IsText { get; set; }

/// <summary>
/// If the file or data has any mark indicating encoding information (byte order mark).
/// </summary>
public bool HasByteOrderMark { get; set; }

// Number of votes collected per charset name while detecting.
readonly Dictionary<string, int> encodingFrequency = new Dictionary<string, int> (StringComparer.Ordinal);

/// <summary>
/// Resets this instance.
/// </summary>
public void Reset ()
{
    _started = false;
    Done = false;
    // IsText was previously left at whatever the last run decided.
    IsText = false;
    HasByteOrderMark = false;
    encodingFrequency.Clear ();
    ude.Reset ();
    EncodingName = null;
}

/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// Only the stream first 20Mb will be analysed.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <returns>Detected encoding name</returns>
public string Detect (Stream inputData)
{
    return Detect (inputData, 20 * 1024 * 1024);
}

/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="maxSize">Size in bytes of analysed data, if you want to analyse only a sample. Use 0 to read all stream data.</param>
/// <param name="bufferSize">Size of the buffer for the stream read.</param>
/// <returns>Detected encoding name</returns>
/// <exception cref="ArgumentOutOfRangeException">bufferSize parameter cannot be 0 or less.</exception>
public string Detect (Stream inputData, int maxSize, int bufferSize = 16 * 1024)
{
    if (bufferSize <= 0)
        throw new ArgumentOutOfRangeException ("bufferSize", "Buffer size cannot be 0 or less.");

    // Budget in bytes rather than in read calls: the old "maxSize / bufferSize" iteration
    // count was 0 (nothing analysed) whenever maxSize was smaller than bufferSize, and it
    // under-sampled streams whose Read returns fewer bytes than requested.
    long remaining = maxSize <= 0 ? long.MaxValue : maxSize;
    byte[] buffer = new byte[bufferSize];
    while (remaining > 0)
    {
        int wanted = (int) Math.Min (buffer.Length, remaining);
        int sz = inputData.Read (buffer, 0, wanted);
        if (sz <= 0)
        {
            break;
        }
        remaining -= sz;
        Detect (buffer, 0, sz);
        if (Done)
            break;
    }
    Complete ();
    return EncodingName;
}

/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to analyse.</param>
/// <param name="count">Number of bytes to analyse.</param>
/// <returns>Detected encoding name, or null when nothing has been detected so far.</returns>
public string Detect (byte[] inputData, int start, int count)
{
    if (Done)
        return EncodingName;
    if (!_started)
    {
        // first chunk: decide whether this is text at all and look for a byte order mark
        Reset ();
        _started = true;
        if (!CheckForTextualData (inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }
        HasByteOrderMark = CheckForByteOrderMark (inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed (inputData, start, count);
    ude.DataEnd ();
    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
    {
        IncrementFrequency (ude.Charset);
        Done = true;
        // publish the result before returning; this path used to return the stale
        // (usually null) EncodingName even though a decision had just been reached
        EncodingName = GetCurrentEncoding ();
        return EncodingName;
    }

    // singular buffer detection: let a fresh detector vote on each slice separately
    var singleUde = new Ude.CharsetDetector ();
    const int udeFeedSize = 4 * 1024;
    // count is a length (that is how it is handed to Feed above), so the slices end at
    // start + count; the loop used to stop at index count, which is only right for start 0
    int end = start + count;
    int step = Math.Min (count, udeFeedSize);
    for (var pos = start; pos < end; pos += step)
    {
        singleUde.Reset ();
        singleUde.Feed (inputData, pos, Math.Min (step, end - pos));
        singleUde.DataEnd ();
        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
            IncrementFrequency (singleUde.Charset);
    }
    // vote for best encoding
    EncodingName = GetCurrentEncoding ();
    // update current encoding name
    return EncodingName;
}

/// <summary>
/// Finalize detection phase and gets detected encoding name.
/// </summary>
/// <returns>
/// The encoding matching <see cref="EncodingName"/>, or null when nothing was detected or
/// when the detected name is not an encoding this runtime can provide (the name itself
/// stays available in <see cref="EncodingName"/>).
/// </returns>
public Encoding Complete ()
{
    Done = true;
    ude.DataEnd ();
    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
    {
        EncodingName = ude.Charset;
    }
    // vote for best encoding; an empty vote must not erase the detector's own verdict
    string voted = GetCurrentEncoding ();
    if (!String.IsNullOrEmpty (voted))
        EncodingName = voted;
    // check result
    if (String.IsNullOrEmpty (EncodingName))
        return null;
    try
    {
        return Encoding.GetEncoding (EncodingName);
    }
    catch (ArgumentException)
    {
        // The detector reported a charset name unknown to this runtime. Report "not
        // detected" so that callers' "?? default" fallbacks apply instead of crashing.
        return null;
    }
}

// Adds one vote for the given charset name.
private void IncrementFrequency (string charset)
{
    int currentCount;
    encodingFrequency.TryGetValue (charset, out currentCount);
    encodingFrequency[charset] = ++currentCount;
}

// Returns the charset name with the most votes, or null when there are no votes.
private string GetCurrentEncoding ()
{
    if (encodingFrequency.Count == 0)
        return null;
    // ASCII should be the last option, since other encodings often has ASCII included...
    // (its votes weigh 0, so it only wins when nothing else was seen)
    return encodingFrequency
        .OrderByDescending (i => i.Key != "ASCII" ? i.Value : 0)
        .FirstOrDefault ().Key;
}
} // class FileEncoding
} // namespace QuickLook.Plugin.TextViewer.SimpleHelpers
diff --git a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml index 19ad555..561ae0e 100644 --- a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml +++ b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml @@ -10,7 +10,8 @@ d:DesignWidth="448.79" UseLayoutRounding="True"> - \ No newline at end of file diff --git a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml.cs b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml.cs index 3365a6d..b2baaff 100644 --- a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml.cs +++ b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.xaml.cs @@ -16,8 +16,10 @@ // along with this program. If not, see . using System.IO; +using System.Text; using System.Windows.Controls; using ICSharpCode.AvalonEdit.Highlighting; +using QuickLook.Plugin.TextViewer.SimpleHelpers; namespace QuickLook.Plugin.TextViewer { @@ -35,9 +37,20 @@ namespace QuickLook.Plugin.TextViewer private void LoadFile(string path) { - viewer.Load(path); + using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + viewer.Encoding = DetectEncoding(s); + viewer.Load(path); + } viewer.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path)); } + + private static Encoding DetectEncoding(Stream s) + { + var det = new FileEncoding(); + det.Detect(s, 1 * 1024 * 1024); + return det.Complete() ?? Encoding.Default; + } } } \ No newline at end of file diff --git a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/packages.config b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/packages.config index 2da5a93..8252bf6 100644 --- a/QuickLook.Plugin/QuickLook.Plugin.TextViewer/packages.config +++ b/QuickLook.Plugin/QuickLook.Plugin.TextViewer/packages.config @@ -2,4 +2,6 @@ + + \ No newline at end of file