Use signed Ude lib

This commit is contained in:
Paddy Xu
2017-07-15 23:03:47 +03:00
parent 61074fea2c
commit 9ca358bd5a
4 changed files with 141 additions and 156 deletions

View File

@@ -1,4 +1,5 @@
#region * License *
#region * License *
/*
SimpleHelpers - FileEncoding
@@ -28,169 +29,27 @@
License: http://www.opensource.org/licenses/mit-license.php
Website: https://github.com/khalidsalomao/SimpleHelpers.Net
*/
#endregion
using System;
using System.IO;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Ude;
namespace QuickLook.Plugin.TextViewer.SimpleHelpers
namespace QuickLook.Plugin.TextViewer
{
public class FileEncoding
{
const int DEFAULT_BUFFER_SIZE = 128 * 1024;
private const int DefaultBufferSize = 128 * 1024;
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputFilename">The input filename.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (string inputFilename, Encoding defaultIfNotDetected = null)
{
using (var stream = new System.IO.FileStream (inputFilename, System.IO.FileMode.Open, System.IO.FileAccess.Read, System.IO.FileShare.ReadWrite | System.IO.FileShare.Delete, DEFAULT_BUFFER_SIZE))
{
return DetectFileEncoding (stream) ?? defaultIfNotDetected;
}
}
private readonly Dictionary<string, int> _encodingFrequency =
new Dictionary<string, int>(StringComparer.Ordinal);
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputStream">The input stream.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (Stream inputStream, Encoding defaultIfNotDetected = null)
{
var det = new FileEncoding ();
det.Detect (inputStream);
return det.Complete () ?? defaultIfNotDetected;
}
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (byte[] inputData, int start, int count, Encoding defaultIfNotDetected = null)
{
var det = new FileEncoding ();
det.Detect (inputData, start, count);
return det.Complete () ?? defaultIfNotDetected;
}
/// <summary>
/// Tries to load file content with the correct encoding.
/// </summary>
/// <param name="filename">The filename.</param>
/// <param name="defaultValue">The default value if unable to load file content.</param>
/// <returns>File content</returns>
public static string TryLoadFile (string filename, string defaultValue = "")
{
try
{
if (System.IO.File.Exists (filename))
{
// enable file encoding detection
var encoding = SimpleHelpers.FileEncoding.DetectFileEncoding (filename);
// Load data based on parameters
return System.IO.File.ReadAllText (filename, encoding);
}
}
catch { }
return defaultValue;
}
/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
public static bool CheckForTextualData (byte[] rawData)
{
return CheckForTextualData (rawData, 0, rawData.Length);
}
/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
public static bool CheckForTextualData (byte[] rawData, int start, int count)
{
if (rawData.Length < count || count < 4 || start + 1 >= count)
return true;
if (CheckForByteOrderMark (rawData, start))
{
return true;
}
// http://stackoverflow.com/questions/910873/how-can-i-determine-if-a-file-is-binary-or-text-in-c
// http://www.gnu.org/software/diffutils/manual/html_node/Binary.html
// count the number od null bytes sequences
// considering only sequeces of 2 0s: "\0\0" or control characters below 10
int nullSequences = 0;
int controlSequences = 0;
for (var i = start + 1; i < count; i++)
{
if (rawData[i - 1] == 0 && rawData[i] == 0)
{
if (++nullSequences > 1)
break;
}
else if (rawData[i - 1] == 0 && rawData[i] < 10)
{
++controlSequences;
}
}
// is text if there is no null byte sequences or less than 10% of the buffer has control caracteres
return nullSequences == 0 && (controlSequences <= (rawData.Length / 10));
}
/// <summary>
/// Detects if data has bytes order mark to indicate its encoding for textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <returns></returns>
private static bool CheckForByteOrderMark (byte[] rawData, int start = 0)
{
if (rawData.Length - start < 4)
return false;
// Detect encoding correctly (from Rick Strahl's blog)
// http://www.west-wind.com/weblog/posts/2007/Nov/28/Detecting-Text-Encoding-for-StreamReader
if (rawData[start] == 0xef && rawData[start + 1] == 0xbb && rawData[start + 2] == 0xbf)
{
// Encoding.UTF8;
return true;
}
else if (rawData[start] == 0xfe && rawData[start + 1] == 0xff)
{
// Encoding.Unicode;
return true;
}
else if (rawData[start] == 0 && rawData[start + 1] == 0 && rawData[start + 2] == 0xfe && rawData[start + 3] == 0xff)
{
// Encoding.UTF32;
return true;
}
else if (rawData[start] == 0x2b && rawData[start + 1] == 0x2f && rawData[start + 2] == 0x76)
{
// Encoding.UTF7;
return true;
}
return false;
}
Ude.CharsetDetector ude = new Ude.CharsetDetector ();
bool _started = false;
private readonly CharsetDetector _ude = new CharsetDetector();
private bool _started;
/// <summary>
@@ -214,59 +73,197 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
/// </summary>
public bool HasByteOrderMark { get; set; }
Dictionary<string, int> encodingFrequency = new Dictionary<string, int> (StringComparer.Ordinal);
/// <summary>
/// Opens the given file for shared reading and tries to detect its text encoding.
/// </summary>
/// <param name="inputFilename">Path of the file to inspect.</param>
/// <param name="defaultIfNotDetected">Encoding to return when no encoding was detected.</param>
/// <returns>
/// The detected encoding, or <paramref name="defaultIfNotDetected" /> when detection yields nothing.
/// </returns>
public static Encoding DetectFileEncoding(string inputFilename, Encoding defaultIfNotDetected = null)
{
    // Share as permissively as possible so detection does not fail on files other processes hold open.
    const FileShare sharing = FileShare.ReadWrite | FileShare.Delete;

    using (var source = new FileStream(inputFilename, FileMode.Open, FileAccess.Read, sharing, DefaultBufferSize))
    {
        var detected = DetectFileEncoding(source);
        if (detected != null)
            return detected;

        return defaultIfNotDetected;
    }
}
/// <summary>
/// Tries to detect the text encoding of the data read from a stream.
/// </summary>
/// <param name="inputStream">The stream to read the data from.</param>
/// <param name="defaultIfNotDetected">Encoding to return when no encoding was detected.</param>
/// <returns>
/// The detected encoding, or <paramref name="defaultIfNotDetected" /> when detection yields nothing.
/// </returns>
public static Encoding DetectFileEncoding(Stream inputStream, Encoding defaultIfNotDetected = null)
{
    // A fresh detector per call: the instance accumulates state while feeding data.
    var detector = new FileEncoding();
    detector.Detect(inputStream);

    var detected = detector.Complete();
    return detected != null ? detected : defaultIfNotDetected;
}
/// <summary>
/// Tries to detect the text encoding of a region of an in-memory buffer.
/// </summary>
/// <param name="inputData">The buffer holding the data.</param>
/// <param name="start">The start position passed on to the detector.</param>
/// <param name="count">The count passed on to the detector.</param>
/// <param name="defaultIfNotDetected">Encoding to return when no encoding was detected.</param>
/// <returns>
/// The detected encoding, or <paramref name="defaultIfNotDetected" /> when detection yields nothing.
/// </returns>
public static Encoding DetectFileEncoding(byte[] inputData, int start, int count,
    Encoding defaultIfNotDetected = null)
{
    // A fresh detector per call: the instance accumulates state while feeding data.
    var detector = new FileEncoding();
    detector.Detect(inputData, start, count);

    var detected = detector.Complete();
    return detected != null ? detected : defaultIfNotDetected;
}
/// <summary>
/// Tries to load the file content as text, decoding it with the encoding detected for the file.
/// </summary>
/// <param name="filename">The filename.</param>
/// <param name="defaultValue">
/// The value returned when the file does not exist, when no encoding could be detected,
/// or when detecting/reading the file throws.
/// </param>
/// <returns>The file content, or <paramref name="defaultValue" />.</returns>
public static string TryLoadFile(string filename, string defaultValue = "")
{
    try
    {
        if (!File.Exists(filename))
            return defaultValue;

        var encoding = DetectFileEncoding(filename);

        // No encoding detected: File.ReadAllText rejects a null encoding, so bail out
        // explicitly instead of relying on that exception being swallowed below.
        if (encoding == null)
            return defaultValue;

        return File.ReadAllText(filename, encoding);
    }
    catch
    {
        // Deliberate best effort: any failure while probing or reading the file
        // (sharing violation, access denied, unsupported charset name, ...) yields defaultValue.
    }

    return defaultValue;
}
/// <summary>
/// Checks whether the whole buffer looks like textual data.
/// </summary>
/// <param name="rawData">The raw data; the entire array is inspected.</param>
/// <returns>The result of the ranged overload applied to the full array.</returns>
public static bool CheckForTextualData(byte[] rawData)
{
    var length = rawData.Length;
    return CheckForTextualData(rawData, 0, length);
}
/// <summary>
/// Checks whether a region of a buffer looks like textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">Index of the first byte to inspect.</param>
/// <param name="count">Number of bytes to inspect, starting at <paramref name="start" />.</param>
/// <returns>
/// <c>true</c> when the region is too small to judge, starts with a byte order mark, or contains no
/// "\0\0" sequence and few enough NUL-prefixed control bytes; otherwise <c>false</c>.
/// </returns>
public static bool CheckForTextualData(byte[] rawData, int start, int count)
{
    // Too little data to judge, or the requested window does not fit in the array: assume text.
    // (Written as a subtraction so that start + count cannot overflow.)
    if (count < 4 || rawData.Length - start < count)
        return true;

    if (CheckForByteOrderMark(rawData, start))
        return true;

    // http://stackoverflow.com/questions/910873/how-can-i-determine-if-a-file-is-binary-or-text-in-c
    // http://www.gnu.org/software/diffutils/manual/html_node/Binary.html
    // Count the number of null byte sequences,
    // considering only sequences of two 0s ("\0\0") or a 0 followed by a control character below 10.
    var end = start + count; // exclusive end of the inspected window
    var nullSequences = 0;
    var controlSequences = 0;

    for (var i = start + 1; i < end; i++)
    {
        // Both patterns require the previous byte to be NUL.
        if (rawData[i - 1] != 0)
            continue;

        if (rawData[i] == 0)
        {
            // One more pair cannot change the verdict once two have been seen.
            if (++nullSequences > 1)
                break;
        }
        else if (rawData[i] < 10)
        {
            ++controlSequences;
        }
    }

    // It is text if there are no null byte sequences and at most 10% of the inspected
    // window consists of control characters.
    return nullSequences == 0 && controlSequences <= count / 10;
}
/// <summary>
/// Detects whether the data starts with a byte order mark that indicates its text encoding.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">Index at which the byte order mark is expected.</param>
/// <returns>
/// <c>true</c> when a UTF-8, UTF-16 (either byte order), UTF-32 (either byte order) or UTF-7 mark is
/// found at <paramref name="start" />; otherwise <c>false</c>.
/// </returns>
private static bool CheckForByteOrderMark(byte[] rawData, int start = 0)
{
    // Each mark is only tested when enough bytes remain for it, so that short inputs
    // consisting of little more than the mark itself are still recognised.
    var available = rawData.Length - start;
    if (available < 2)
        return false;

    var b0 = rawData[start];
    var b1 = rawData[start + 1];

    // UTF-16 big-endian: FE FF
    if (b0 == 0xfe && b1 == 0xff)
        return true;

    // UTF-16 little-endian: FF FE (this is also the prefix of the UTF-32 little-endian mark FF FE 00 00)
    if (b0 == 0xff && b1 == 0xfe)
        return true;

    if (available >= 3)
    {
        var b2 = rawData[start + 2];

        // UTF-8: EF BB BF
        if (b0 == 0xef && b1 == 0xbb && b2 == 0xbf)
            return true;

        // UTF-7: 2B 2F 76
        if (b0 == 0x2b && b1 == 0x2f && b2 == 0x76)
            return true;

        // UTF-32 big-endian: 00 00 FE FF
        if (available >= 4 && b0 == 0 && b1 == 0 && b2 == 0xfe && rawData[start + 3] == 0xff)
            return true;
    }

    return false;
}
/// <summary>
/// Resets this instance.
/// </summary>
public void Reset ()
public void Reset()
{
_started = false;
Done = false;
HasByteOrderMark = false;
encodingFrequency.Clear ();
ude.Reset ();
_encodingFrequency.Clear();
_ude.Reset();
EncodingName = null;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.<para/>
/// Only the stream first 20Mb will be analysed.
/// Detects the encoding of textual data of the specified input data.
/// <para />
/// Only the first 1 MB of the stream will be analysed.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <returns>Detected encoding name</returns>
public string Detect (Stream inputData)
public string Detect(Stream inputData)
{
return Detect (inputData, 20 * 1024 * 1024);
return Detect(inputData, 1 * 1024 * 1024);
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="maxSize">Size in byte of analysed data, if you want to analysed only a sample. Use 0 to read all stream data.</param>
/// <param name="maxSize">
/// Size in bytes of the data to analyse, if you want to analyse only a sample. Use 0 to read all stream
/// data.
/// </param>
/// <param name="bufferSize">Size of the buffer for the stream read.</param>
/// <returns>Detected encoding name</returns>
/// <exception cref="ArgumentOutOfRangeException">bufferSize parameter cannot be 0 or less.</exception>
public string Detect (Stream inputData, int maxSize, int bufferSize = 16 * 1024)
public string Detect(Stream inputData, int maxSize, int bufferSize = 16 * 1024)
{
if (bufferSize <= 0)
throw new ArgumentOutOfRangeException ("bufferSize", "Buffer size cannot be 0 or less.");
int maxIterations = maxSize <= 0 ? Int32.MaxValue : maxSize / bufferSize;
int i = 0;
byte[] buffer = new byte[bufferSize];
throw new ArgumentOutOfRangeException(nameof(bufferSize), @"Buffer size cannot be 0 or less.");
var maxIterations = maxSize <= 0 ? int.MaxValue : maxSize / bufferSize;
var i = 0;
var buffer = new byte[bufferSize];
while (i++ < maxIterations)
{
int sz = inputData.Read (buffer, 0, (int)buffer.Length);
var sz = inputData.Read(buffer, 0, buffer.Length);
if (sz <= 0)
{
break;
}
Detect (buffer, 0, sz);
Detect(buffer, 0, sz);
if (Done)
break;
}
Complete ();
Complete();
return EncodingName;
}
@@ -277,52 +274,52 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <returns>Detected encoding name</returns>
public string Detect (byte[] inputData, int start, int count)
public string Detect(byte[] inputData, int start, int count)
{
if (Done)
return EncodingName;
if (!_started)
{
Reset ();
Reset();
_started = true;
if (!CheckForTextualData (inputData, start, count))
if (!CheckForTextualData(inputData, start, count))
{
IsText = false;
Done = true;
return EncodingName;
}
HasByteOrderMark = CheckForByteOrderMark (inputData, start);
HasByteOrderMark = CheckForByteOrderMark(inputData, start);
IsText = true;
}
// execute charset detector
ude.Feed (inputData, start, count);
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
_ude.Feed(inputData, start, count);
_ude.DataEnd();
if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
{
IncrementFrequency (ude.Charset);
IncrementFrequency(_ude.Charset);
Done = true;
return EncodingName;
}
// singular buffer detection
var singleUde = new Ude.CharsetDetector ();
var singleUde = new CharsetDetector();
const int udeFeedSize = 4 * 1024;
int step = (count - start) < udeFeedSize ? (count - start) : udeFeedSize;
var step = count - start < udeFeedSize ? count - start : udeFeedSize;
for (var pos = start; pos < count; pos += step)
{
singleUde.Reset ();
singleUde.Reset();
if (pos + step > count)
singleUde.Feed (inputData, pos, count - pos);
singleUde.Feed(inputData, pos, count - pos);
else
singleUde.Feed (inputData, pos, step);
singleUde.DataEnd ();
singleUde.Feed(inputData, pos, step);
singleUde.DataEnd();
// update encoding frequency
if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
IncrementFrequency (singleUde.Charset);
if (singleUde.Confidence > 0.3 && !string.IsNullOrEmpty(singleUde.Charset))
IncrementFrequency(singleUde.Charset);
}
// vote for best encoding
EncodingName = GetCurrentEncoding ();
EncodingName = GetCurrentEncoding();
// update current encoding name
return EncodingName;
}
@@ -331,37 +328,35 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
/// Finalize detection phase and gets detected encoding name.
/// </summary>
/// <returns></returns>
public Encoding Complete ()
public Encoding Complete()
{
Done = true;
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
{
EncodingName = ude.Charset;
}
_ude.DataEnd();
if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
EncodingName = _ude.Charset;
// vote for best encoding
EncodingName = GetCurrentEncoding ();
EncodingName = GetCurrentEncoding();
// check result
if (!String.IsNullOrEmpty (EncodingName))
return Encoding.GetEncoding (EncodingName);
if (!string.IsNullOrEmpty(EncodingName))
return Encoding.GetEncoding(EncodingName);
return null;
}
private void IncrementFrequency (string charset)
private void IncrementFrequency(string charset)
{
int currentCount;
encodingFrequency.TryGetValue (charset, out currentCount);
encodingFrequency[charset] = ++currentCount;
_encodingFrequency.TryGetValue(charset, out currentCount);
_encodingFrequency[charset] = ++currentCount;
}
private string GetCurrentEncoding ()
private string GetCurrentEncoding()
{
if (encodingFrequency.Count == 0)
if (_encodingFrequency.Count == 0)
return null;
// ASCII should be the last option, since other encodings often has ASCII included...
return encodingFrequency
.OrderByDescending (i => i.Value * (i.Key != ("ASCII") ? 1 : 0))
.FirstOrDefault ().Key;
return _encodingFrequency
.OrderByDescending(i => i.Value * (i.Key != "ASCII" ? 1 : 0))
.FirstOrDefault().Key;
}
}
}

View File

@@ -58,8 +58,8 @@
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xaml" />
<Reference Include="Ude, Version=0.1.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\UDE.CSharp.1.1.0\lib\Ude.dll</HintPath>
<Reference Include="Ude, Version=0.1.1.0, Culture=neutral, PublicKeyToken=dd537652db4726a9, processorArchitecture=MSIL">
<HintPath>..\..\packages\Ude.Signed.0.1.1\lib\net40\Ude.dll</HintPath>
</Reference>
<Reference Include="WindowsBase" />
</ItemGroup>
@@ -69,7 +69,7 @@
</Compile>
<Compile Include="Plugin.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="SimpleHelpers\FileEncoding.cs" />
<Compile Include="FileEncoding.cs" />
<Compile Include="TextViewerPanel.xaml.cs">
<DependentUpon>TextViewerPanel.xaml</DependentUpon>
</Compile>

View File

@@ -20,7 +20,6 @@ using System.Text;
using System.Windows.Controls;
using System.Windows.Media;
using ICSharpCode.AvalonEdit.Highlighting;
using QuickLook.Plugin.TextViewer.SimpleHelpers;
namespace QuickLook.Plugin.TextViewer
{
@@ -43,18 +42,11 @@ namespace QuickLook.Plugin.TextViewer
{
using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
{
viewer.Encoding = DetectEncoding(s);
viewer.Encoding = FileEncoding.DetectFileEncoding(s, Encoding.Default);
}
viewer.Load(path);
}
viewer.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
}
private static Encoding DetectEncoding(Stream s)
{
var det = new FileEncoding();
det.Detect(s, 1 * 1024 * 1024);
return det.Complete() ?? Encoding.Default;
}
}
}

View File

@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="AvalonEdit" version="5.0.3" targetFramework="net452" />
<package id="SimpleHelpers.FileEncoding" version="1.4.0" targetFramework="net462" />
<package id="UDE.CSharp" version="1.1.0" targetFramework="net462" />
<package id="Ude.Signed" version="0.1.1" targetFramework="net462" />
</packages>