Use signed Ude lib

This commit is contained in:
Paddy Xu
2017-07-15 23:03:47 +03:00
parent 61074fea2c
commit 9ca358bd5a
4 changed files with 141 additions and 156 deletions

View File

@@ -1,4 +1,5 @@
#region * License *
#region * License *
/*
SimpleHelpers - FileEncoding
@@ -28,19 +29,49 @@
License: http://www.opensource.org/licenses/mit-license.php
Website: https://github.com/khalidsalomao/SimpleHelpers.Net
*/
#endregion
using System;
using System.IO;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Ude;
namespace QuickLook.Plugin.TextViewer.SimpleHelpers
namespace QuickLook.Plugin.TextViewer
{
public class FileEncoding
{
const int DEFAULT_BUFFER_SIZE = 128 * 1024;
private const int DefaultBufferSize = 128 * 1024;
private readonly Dictionary<string, int> _encodingFrequency =
new Dictionary<string, int>(StringComparer.Ordinal);
private readonly CharsetDetector _ude = new CharsetDetector();
private bool _started;
/// <summary>
/// If the detection has reached a decision.
/// </summary>
/// <value>The done.</value>
public bool Done { get; set; }
/// <summary>
/// Detected encoding name.
/// </summary>
public string EncodingName { get; set; }
/// <summary>
/// If the data contains textual data.
/// </summary>
public bool IsText { get; set; }
/// <summary>
/// If the file or data has any mark indicating encoding information (byte order mark).
/// </summary>
public bool HasByteOrderMark { get; set; }
/// <summary>
/// Tries to detect the file encoding.
@@ -50,7 +81,9 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
/// <returns></returns>
public static Encoding DetectFileEncoding(string inputFilename, Encoding defaultIfNotDetected = null)
{
using (var stream = new System.IO.FileStream (inputFilename, System.IO.FileMode.Open, System.IO.FileAccess.Read, System.IO.FileShare.ReadWrite | System.IO.FileShare.Delete, DEFAULT_BUFFER_SIZE))
using (var stream = new FileStream(inputFilename, FileMode.Open,
FileAccess.Read, FileShare.ReadWrite | FileShare.Delete,
DefaultBufferSize))
{
return DetectFileEncoding(stream) ?? defaultIfNotDetected;
}
@@ -77,7 +110,8 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
/// <param name="count">The count.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (byte[] inputData, int start, int count, Encoding defaultIfNotDetected = null)
public static Encoding DetectFileEncoding(byte[] inputData, int start, int count,
Encoding defaultIfNotDetected = null)
{
var det = new FileEncoding();
det.Detect(inputData, start, count);
@@ -94,15 +128,18 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
{
try
{
if (System.IO.File.Exists (filename))
if (File.Exists(filename))
{
// enable file encoding detection
var encoding = SimpleHelpers.FileEncoding.DetectFileEncoding (filename);
var encoding = DetectFileEncoding(filename);
// Load data based on parameters
return System.IO.File.ReadAllText (filename, encoding);
return File.ReadAllText(filename, encoding);
}
}
catch { }
catch
{
// ignored
}
return defaultValue;
}
@@ -127,18 +164,15 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
return true;
if (CheckForByteOrderMark(rawData, start))
{
return true;
}
// http://stackoverflow.com/questions/910873/how-can-i-determine-if-a-file-is-binary-or-text-in-c
// http://www.gnu.org/software/diffutils/manual/html_node/Binary.html
// count the number of null byte sequences
// considering only sequences of 2 0s: "\0\0" or control characters below 10
int nullSequences = 0;
int controlSequences = 0;
var nullSequences = 0;
var controlSequences = 0;
for (var i = start + 1; i < count; i++)
{
if (rawData[i - 1] == 0 && rawData[i] == 0)
{
if (++nullSequences > 1)
@@ -148,10 +182,9 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
{
++controlSequences;
}
}
// is text if there are no null byte sequences and no more than 10% of the buffer is control characters
return nullSequences == 0 && (controlSequences <= (rawData.Length / 10));
return nullSequences == 0 && controlSequences <= rawData.Length / 10;
}
/// <summary>
@@ -167,55 +200,17 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
// Detect encoding correctly (from Rick Strahl's blog)
// http://www.west-wind.com/weblog/posts/2007/Nov/28/Detecting-Text-Encoding-for-StreamReader
if (rawData[start] == 0xef && rawData[start + 1] == 0xbb && rawData[start + 2] == 0xbf)
{
// Encoding.UTF8;
return true;
}
else if (rawData[start] == 0xfe && rawData[start + 1] == 0xff)
{
// Encoding.Unicode;
if (rawData[start] == 0xfe && rawData[start + 1] == 0xff)
return true;
}
else if (rawData[start] == 0 && rawData[start + 1] == 0 && rawData[start + 2] == 0xfe && rawData[start + 3] == 0xff)
{
// Encoding.UTF32;
if (rawData[start] == 0 && rawData[start + 1] == 0 && rawData[start + 2] == 0xfe &&
rawData[start + 3] == 0xff)
return true;
}
else if (rawData[start] == 0x2b && rawData[start + 1] == 0x2f && rawData[start + 2] == 0x76)
{
// Encoding.UTF7;
if (rawData[start] == 0x2b && rawData[start + 1] == 0x2f && rawData[start + 2] == 0x76)
return true;
}
return false;
}
Ude.CharsetDetector ude = new Ude.CharsetDetector ();
bool _started = false;
/// <summary>
/// If the detection has reached a decision.
/// </summary>
/// <value>The done.</value>
public bool Done { get; set; }
/// <summary>
/// Detected encoding name.
/// </summary>
public string EncodingName { get; set; }
/// <summary>
/// If the data contains textual data.
/// </summary>
public bool IsText { get; set; }
/// <summary>
/// If the file or data has any mark indicating encoding information (byte order mark).
/// </summary>
public bool HasByteOrderMark { get; set; }
Dictionary<string, int> encodingFrequency = new Dictionary<string, int> (StringComparer.Ordinal);
/// <summary>
/// Resets this instance.
/// </summary>
@@ -224,44 +219,46 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
_started = false;
Done = false;
HasByteOrderMark = false;
encodingFrequency.Clear ();
ude.Reset ();
_encodingFrequency.Clear();
_ude.Reset();
EncodingName = null;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.<para/>
/// Only the stream first 20Mb will be analysed.
/// Detects the encoding of textual data of the specified input data.
/// <para />
/// Only the first 1 MB of the stream will be analysed.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <returns>Detected encoding name</returns>
public string Detect(Stream inputData)
{
return Detect (inputData, 20 * 1024 * 1024);
return Detect(inputData, 1 * 1024 * 1024);
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="maxSize">Size in byte of analysed data, if you want to analysed only a sample. Use 0 to read all stream data.</param>
/// <param name="maxSize">
/// Size in bytes of the data to analyse, if you want to analyse only a sample. Use 0 to read all stream
/// data.
/// </param>
/// <param name="bufferSize">Size of the buffer for the stream read.</param>
/// <returns>Detected encoding name</returns>
/// <exception cref="ArgumentOutOfRangeException">bufferSize parameter cannot be 0 or less.</exception>
public string Detect(Stream inputData, int maxSize, int bufferSize = 16 * 1024)
{
if (bufferSize <= 0)
throw new ArgumentOutOfRangeException ("bufferSize", "Buffer size cannot be 0 or less.");
int maxIterations = maxSize <= 0 ? Int32.MaxValue : maxSize / bufferSize;
int i = 0;
byte[] buffer = new byte[bufferSize];
throw new ArgumentOutOfRangeException(nameof(bufferSize), @"Buffer size cannot be 0 or less.");
var maxIterations = maxSize <= 0 ? int.MaxValue : maxSize / bufferSize;
var i = 0;
var buffer = new byte[bufferSize];
while (i++ < maxIterations)
{
int sz = inputData.Read (buffer, 0, (int)buffer.Length);
var sz = inputData.Read(buffer, 0, buffer.Length);
if (sz <= 0)
{
break;
}
Detect(buffer, 0, sz);
if (Done)
break;
@@ -296,19 +293,19 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
}
// execute charset detector
ude.Feed (inputData, start, count);
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
_ude.Feed(inputData, start, count);
_ude.DataEnd();
if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
{
IncrementFrequency (ude.Charset);
IncrementFrequency(_ude.Charset);
Done = true;
return EncodingName;
}
// singular buffer detection
var singleUde = new Ude.CharsetDetector ();
var singleUde = new CharsetDetector();
const int udeFeedSize = 4 * 1024;
int step = (count - start) < udeFeedSize ? (count - start) : udeFeedSize;
var step = count - start < udeFeedSize ? count - start : udeFeedSize;
for (var pos = start; pos < count; pos += step)
{
singleUde.Reset();
@@ -318,7 +315,7 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
singleUde.Feed(inputData, pos, step);
singleUde.DataEnd();
// update encoding frequency
if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
if (singleUde.Confidence > 0.3 && !string.IsNullOrEmpty(singleUde.Charset))
IncrementFrequency(singleUde.Charset);
}
// vote for best encoding
@@ -334,15 +331,13 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
public Encoding Complete()
{
Done = true;
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
{
EncodingName = ude.Charset;
}
_ude.DataEnd();
if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
EncodingName = _ude.Charset;
// vote for best encoding
EncodingName = GetCurrentEncoding();
// check result
if (!String.IsNullOrEmpty (EncodingName))
if (!string.IsNullOrEmpty(EncodingName))
return Encoding.GetEncoding(EncodingName);
return null;
}
@@ -350,17 +345,17 @@ namespace QuickLook.Plugin.TextViewer.SimpleHelpers
private void IncrementFrequency(string charset)
{
int currentCount;
encodingFrequency.TryGetValue (charset, out currentCount);
encodingFrequency[charset] = ++currentCount;
_encodingFrequency.TryGetValue(charset, out currentCount);
_encodingFrequency[charset] = ++currentCount;
}
private string GetCurrentEncoding()
{
if (encodingFrequency.Count == 0)
if (_encodingFrequency.Count == 0)
return null;
// ASCII should be the last option, since other encodings often have ASCII included...
return encodingFrequency
.OrderByDescending (i => i.Value * (i.Key != ("ASCII") ? 1 : 0))
return _encodingFrequency
.OrderByDescending(i => i.Value * (i.Key != "ASCII" ? 1 : 0))
.FirstOrDefault().Key;
}
}

View File

@@ -58,8 +58,8 @@
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xaml" />
<Reference Include="Ude, Version=0.1.0.0, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\..\packages\UDE.CSharp.1.1.0\lib\Ude.dll</HintPath>
<Reference Include="Ude, Version=0.1.1.0, Culture=neutral, PublicKeyToken=dd537652db4726a9, processorArchitecture=MSIL">
<HintPath>..\..\packages\Ude.Signed.0.1.1\lib\net40\Ude.dll</HintPath>
</Reference>
<Reference Include="WindowsBase" />
</ItemGroup>
@@ -69,7 +69,7 @@
</Compile>
<Compile Include="Plugin.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="SimpleHelpers\FileEncoding.cs" />
<Compile Include="FileEncoding.cs" />
<Compile Include="TextViewerPanel.xaml.cs">
<DependentUpon>TextViewerPanel.xaml</DependentUpon>
</Compile>

View File

@@ -20,7 +20,6 @@ using System.Text;
using System.Windows.Controls;
using System.Windows.Media;
using ICSharpCode.AvalonEdit.Highlighting;
using QuickLook.Plugin.TextViewer.SimpleHelpers;
namespace QuickLook.Plugin.TextViewer
{
@@ -43,18 +42,11 @@ namespace QuickLook.Plugin.TextViewer
{
using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
{
viewer.Encoding = DetectEncoding(s);
viewer.Load(path);
viewer.Encoding = FileEncoding.DetectFileEncoding(s, Encoding.Default);
}
viewer.Load(path);
viewer.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
}
private static Encoding DetectEncoding(Stream s)
{
var det = new FileEncoding();
det.Detect(s, 1 * 1024 * 1024);
return det.Complete() ?? Encoding.Default;
}
}
}

View File

@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="AvalonEdit" version="5.0.3" targetFramework="net452" />
<package id="SimpleHelpers.FileEncoding" version="1.4.0" targetFramework="net462" />
<package id="UDE.CSharp" version="1.1.0" targetFramework="net462" />
<package id="Ude.Signed" version="0.1.1" targetFramework="net462" />
</packages>