判断文件是否为UTF-8编码(以前收集的)

/// <summary>
/// Heuristically decides whether the file at <paramref name="strFileName"/> is UTF-8 encoded.
/// </summary>
/// <param name="strFileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when <c>utf8_probability</c> gives the file's bytes a score greater than zero;
/// otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// The whole file is loaded into memory, so this is intended for reasonably small text files.
/// Exceptions raised while opening or reading the file (for example
/// <see cref="System.IO.FileNotFoundException"/>) propagate to the caller.
/// </remarks>
private bool CheckEncoding(string strFileName)
{
    // File.ReadAllBytes opens the file for reading only (the previous FileStream
    // constructor asked for read/write access, which fails on read-only files)
    // and keeps reading until the whole file has been consumed, which a single
    // Stream.Read call does not guarantee.
    byte[] bs = File.ReadAllBytes(strFileName);

    return utf8_probability(bs) > 0;
}
55

/// <summary>
/// Estimates how likely it is that <paramref name="rawtext"/> is UTF-8 encoded text.
/// </summary>
/// <param name="rawtext">Raw bytes to examine.</param>
/// <returns>
/// A score from 0 to 100: the percentage of non-ASCII bytes that belong to a well-formed
/// multi-byte sequence (lead byte followed by the right number of continuation bytes).
/// The score is returned only when it is above 98, or above 95 with more than 30 good bytes;
/// in every other case, including empty or pure-ASCII input, the result is 0.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="rawtext"/> is null.</exception>
private int utf8_probability(byte[] rawtext)
{
    if (rawtext == null)
    {
        throw new ArgumentNullException("rawtext");
    }

    int length = rawtext.Length;
    int goodBytes = 0;
    int asciiBytes = 0;

    // Maybe also use UTF8 Byte Order Mark: EF BB BF
    // (it is a well-formed three-byte sequence, so it already counts as "good").

    for (int i = 0; i < length; i++)
    {
        byte lead = rawtext[i];

        if (lead < 0x80)
        {
            // One byte: plain ASCII is ignored, it would throw off the ratio.
            asciiBytes++;
            continue;
        }

        // Number of continuation bytes the lead byte announces.
        int trailing;
        if (lead >= 0xC0 && lead <= 0xDF)
        {
            trailing = 1; // Two-byte sequence
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2; // Three-byte sequence
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3; // Four-byte sequence (characters beyond U+FFFF)
        }
        else
        {
            // Stray continuation byte or invalid lead: counts as a bad byte.
            continue;
        }

        // Check the bounds BEFORE touching the continuation bytes; a sequence that
        // is cut off by the end of the buffer is simply treated as malformed.
        if (i + trailing >= length)
        {
            continue;
        }

        bool wellFormed = true;
        for (int k = 1; k <= trailing; k++)
        {
            byte next = rawtext[i + k];
            if (next < 0x80 || next > 0xBF)
            {
                wellFormed = false;
                break;
            }
        }

        if (wellFormed)
        {
            goodBytes += trailing + 1;
            i += trailing; // Skip the continuation bytes that were just consumed.
        }
    }

    // Nothing but ASCII (or nothing at all): no evidence for UTF-8, and it avoids 0 / 0 below.
    if (asciiBytes == length)
    {
        return 0;
    }

    int score = (int)(100 * ((float)goodBytes / (float)(length - asciiBytes)));

    // If not above 98, reduce to zero to prevent coincidental matches.
    // Allows for some (few) badly formed sequences when there is enough good data.
    if (score > 98)
    {
        return score;
    }

    if (score > 95 && goodBytes > 30)
    {
        return score;
    }

    return 0;
}

posted on 2007-07-02 14:17 房客阅读(2001) 评论(2) 收藏举报

刷新页面返回顶部

房客的大杂烩