判断文件是否为UTF-8编码(以前收集的)
1
/// <summary>
/// Heuristically determines whether the file at <paramref name="strFileName"/> is UTF-8 encoded.
/// </summary>
/// <param name="strFileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when <c>utf8_probability</c> returns a score above zero for the file's bytes;
/// otherwise <c>false</c>.
/// </returns>
private bool CheckEncoding(string strFileName)
{
    // File.ReadAllBytes opens the file with read-only access and keeps reading until the
    // whole file is in memory. A single Stream.Read call is not guaranteed to fill the
    // buffer, and FileStream(path, FileMode.Open) asks for read/write access, which fails
    // on read-only files.
    byte[] bs = File.ReadAllBytes(strFileName);

    // No separate byte-order-mark check is done here: the UTF-8 BOM (EF BB BF) has the shape
    // of a three-byte sequence, so the scorer already counts it as well-formed.
    return utf8_probability(bs) > 0;
}
55
56
57
/// <summary>
/// Scores how likely <paramref name="rawtext"/> is UTF-8 text by measuring what share of its
/// non-ASCII bytes belong to well-formed multi-byte sequences.
/// </summary>
/// <param name="rawtext">Raw bytes to examine.</param>
/// <returns>
/// The percentage (0-100) of non-ASCII bytes that are part of a recognised two-, three- or
/// four-byte sequence, provided that percentage is above 98, or above 95 with more than 30
/// such bytes. Returns 0 otherwise, including for empty or ASCII-only input.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="rawtext"/> is <c>null</c>.</exception>
private int utf8_probability(byte[] rawtext)
{
    if (rawtext == null)
    {
        throw new ArgumentNullException("rawtext");
    }

    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    for (int i = 0; i < rawtextlen; i++)
    {
        byte lead = rawtext[i];

        if (lead < 0x80)
        {
            // ASCII bytes are tallied separately and excluded from the ratio below,
            // because they would otherwise throw off the count.
            asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        int trailing;
        if (lead >= 0xC0 && lead <= 0xDF)
        {
            trailing = 1; // two-byte sequence
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2; // three-byte sequence
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3; // four-byte sequence
        }
        else
        {
            // Stray continuation byte or invalid lead: counts as a bad non-ASCII byte.
            continue;
        }

        // Check the bounds BEFORE touching the continuation bytes; a sequence cut off by
        // the end of the buffer is simply malformed, not an error.
        if (i + trailing >= rawtextlen)
        {
            continue;
        }

        bool wellFormed = true;
        for (int k = 1; k <= trailing; k++)
        {
            byte continuation = rawtext[i + k];
            if (continuation < 0x80 || continuation > 0xBF)
            {
                wellFormed = false;
                break;
            }
        }

        if (wellFormed)
        {
            goodbytes += trailing + 1;
            i += trailing; // skip the continuation bytes just consumed
        }
    }

    int nonAsciiBytes = rawtextlen - asciibytes;
    if (nonAsciiBytes == 0)
    {
        return 0;
    }

    // Exact integer arithmetic; long avoids overflow of 100 * goodbytes on large inputs.
    int score = (int)(100L * goodbytes / nonAsciiBytes);

    // Anything not clearly above the thresholds is reduced to zero to prevent coincidental
    // matches, while still tolerating a few badly formed sequences.
    if (score > 98 || (score > 95 && goodbytes > 30))
    {
        return score;
    }

    return 0;
}
/// <summary>
/// Heuristically determines whether the file at <paramref name="strFileName"/> is UTF-8 encoded.
/// </summary>
/// <param name="strFileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when <c>utf8_probability</c> returns a score above zero for the file's bytes;
/// otherwise <c>false</c>.
/// </returns>
private bool CheckEncoding(string strFileName)
{
    // File.ReadAllBytes opens the file with read-only access and keeps reading until the
    // whole file is in memory. A single Stream.Read call is not guaranteed to fill the
    // buffer, and FileStream(path, FileMode.Open) asks for read/write access, which fails
    // on read-only files.
    byte[] bs = File.ReadAllBytes(strFileName);

    // No separate byte-order-mark check is done here: the UTF-8 BOM (EF BB BF) has the shape
    // of a three-byte sequence, so the scorer already counts it as well-formed.
    return utf8_probability(bs) > 0;
}
56
57
/// <summary>
/// Scores how likely <paramref name="rawtext"/> is UTF-8 text by measuring what share of its
/// non-ASCII bytes belong to well-formed multi-byte sequences.
/// </summary>
/// <param name="rawtext">Raw bytes to examine.</param>
/// <returns>
/// The percentage (0-100) of non-ASCII bytes that are part of a recognised two-, three- or
/// four-byte sequence, provided that percentage is above 98, or above 95 with more than 30
/// such bytes. Returns 0 otherwise, including for empty or ASCII-only input.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="rawtext"/> is <c>null</c>.</exception>
private int utf8_probability(byte[] rawtext)
{
    if (rawtext == null)
    {
        throw new ArgumentNullException("rawtext");
    }

    int rawtextlen = rawtext.Length;
    int goodbytes = 0;
    int asciibytes = 0;

    for (int i = 0; i < rawtextlen; i++)
    {
        byte lead = rawtext[i];

        if (lead < 0x80)
        {
            // ASCII bytes are tallied separately and excluded from the ratio below,
            // because they would otherwise throw off the count.
            asciibytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        int trailing;
        if (lead >= 0xC0 && lead <= 0xDF)
        {
            trailing = 1; // two-byte sequence
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2; // three-byte sequence
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3; // four-byte sequence
        }
        else
        {
            // Stray continuation byte or invalid lead: counts as a bad non-ASCII byte.
            continue;
        }

        // Check the bounds BEFORE touching the continuation bytes; a sequence cut off by
        // the end of the buffer is simply malformed, not an error.
        if (i + trailing >= rawtextlen)
        {
            continue;
        }

        bool wellFormed = true;
        for (int k = 1; k <= trailing; k++)
        {
            byte continuation = rawtext[i + k];
            if (continuation < 0x80 || continuation > 0xBF)
            {
                wellFormed = false;
                break;
            }
        }

        if (wellFormed)
        {
            goodbytes += trailing + 1;
            i += trailing; // skip the continuation bytes just consumed
        }
    }

    int nonAsciiBytes = rawtextlen - asciibytes;
    if (nonAsciiBytes == 0)
    {
        return 0;
    }

    // Exact integer arithmetic; long avoids overflow of 100 * goodbytes on large inputs.
    int score = (int)(100L * goodbytes / nonAsciiBytes);

    // Anything not clearly above the thresholds is reduced to zero to prevent coincidental
    // matches, while still tolerating a few badly formed sequences.
    if (score > 98 || (score > 95 && goodbytes > 30))
    {
        return score;
    }

    return 0;
}

浙公网安备 33010602011771号