當前位置:首頁 > web 技術 > 使用Adodb.Stream判斷文件編碼及進行編碼轉換(Unicode,Utf-8,GB2312等)
4月02日

使用Adodb.Stream判斷文件編碼及進行編碼轉換(Unicode,Utf-8,GB2312等)

robert web 技術 0
Sub FileZM(sFile As String, sCode As String, dFile As String, dCode As String)
' Re-encodes a text file: reads sFile decoded as sCode, then writes the same
' text to dFile encoded as dCode.
'   sFile - path of the source file
'   sCode - charset name of the source file, e.g. "gb2312", "UTF-8"
'   dFile - path of the destination file (overwritten if it already exists)
'   dCode - charset name to use for the destination file
Dim ObjStream As Object
' Decoded file contents. Kept in its own variable: parameters are ByRef by
' default, so reusing sCode as the buffer would overwrite the caller's
' charset variable with the whole file text.
Dim sText As String
Set ObjStream = CreateObject("Adodb.Stream")
With ObjStream
    .Mode = 3         'adModeReadWrite = 3
    .Type = 1         'adTypeBinary = 1, load the raw bytes untouched
    .Open
    .LoadFromFile sFile
    .Position = 0     'Type can only be changed while Position is 0
    .Type = 2         'adTypeText = 2
    .Charset = sCode  'decode the loaded bytes using the source charset
    sText = .ReadText

    .Position = 0     'rewind to the start of the stream
    .SetEOS           'truncate everything after Position, i.e. empty the stream before rewriting
    .Charset = dCode  'encode the output using the destination charset
    .WriteText sText
    .SaveToFile dFile, 2   'adSaveCreateOverWrite = 2
    .Close
End With
Set ObjStream = Nothing
End Sub

ADODB.Stream組件Charset屬性值

    ANSI_X3.4-1968|iso-8859-1
ANSI_X3.4-1986|iso-8859-1
arabic|iso-8859-6
ascii|iso-8859-1
ASMO-708
Big5
chinese|gb2312
CN-GB|gb2312
cp1256|windows-1256
cp367|iso-8859-1
cp819|iso-8859-1
cp852|ibm852
cp866|ibm866
csASCII|iso-8859-1
csbig5|big5
csEUCKR|ks_c_5601-1987
csEUCPkdFmtJapanese|euc-jp
csGB2312|gb2312
csISO2022JP|_iso-2022-jp$ESC
csISO2022KR|iso-2022-kr
csISO58GB231280|gb2312
csISOLatin1|windows-1252
csISOLatin2|iso-8859-2
csISOLatin4|iso-8859-4
csISOLatin5|iso-8859-9
csISOLatinArabic|iso-8859-6
csISOLatinCyrillic|iso-8859-5
csISOLatinGreek|iso-8859-7
csISOLatinHebrew|iso-8859-8
csKOI8R|koi8-r
csKSC56011987|ks_c_5601-1987
csShiftJIS|shift_jis
csUnicode11UTF7|utf-7
csWindows31J|shift_jis
cyrillic|iso-8859-5
DOS-720
DOS-862
DOS-874
ECMA-114|iso-8859-6
ECMA-118|iso-8859-7
ELOT_928|iso-8859-7
euc-jp
euc-kr
Extended_UNIX_Code_Packed_Format_for_Japanese|euc-jp
GB2312
GBK|gb2312
GB_2312-80|gb2312
greek|iso-8859-7
greek8|iso-8859-7
hebrew|iso-8859-8
hz-gb-2312
IBM367|iso-8859-1
ibm819|iso-8859-1
ibm852
ibm866
iso-2022-jp
iso-2022-kr
iso-8859-1
iso-8859-11|windows-874
iso-8859-2
iso-8859-3
iso-8859-4
iso-8859-5
iso-8859-6
iso-8859-7
iso-8859-8
ISO-8859-8 Visual|iso-8859-8
iso-8859-8-i
iso-8859-9
iso-ir-100|iso-8859-1
iso-ir-101|iso-8859-2
iso-ir-110|iso-8859-4
iso-ir-111|iso-8859-4
iso-ir-126|iso-8859-7
iso-ir-127|iso-8859-6
iso-ir-138|iso-8859-8
iso-ir-144|iso-8859-5
iso-ir-148|iso-8859-9
iso-ir-149|ks_c_5601-1987
iso-ir-58|gb2312
iso-ir-6|iso-8859-1
ISO646-US|iso-8859-1
iso8859-1|iso-8859-1
iso8859-2|iso-8859-2
ISO_646.irv:1991|iso-8859-1
iso_8859-1|iso-8859-1
iso_8859-1:1987|iso-8859-1
iso_8859-2|iso-8859-2
iso_8859-2:1987|iso-8859-2
ISO_8859-4|iso-8859-4
ISO_8859-4:1988|iso-8859-4
ISO_8859-5|iso-8859-5
ISO_8859-5:1988|iso-8859-5
ISO_8859-6|iso-8859-6
ISO_8859-6:1987|iso-8859-6
ISO_8859-7|iso-8859-7
ISO_8859-7:1987|iso-8859-7
ISO_8859-8|iso-8859-8
ISO_8859-8:1988|iso-8859-8
ISO_8859-9|iso-8859-9
ISO_8859-9:1989|iso-8859-9
koi|koi8-r
koi8-r
koi8-ru
korean|ks_c_5601-1987
KSC5601|ks_c_5601-1987
KSC_5601|ks_c_5601-1987
ks_c_5601|ks_c_5601-1987
ks_c_5601-1987
ks_c_5601-1989|ks_c_5601-1987
l1|windows-1252
l2|iso-8859-2
l4|iso-8859-4
l5|iso-8859-9
latin1|iso-8859-1
latin2|iso-8859-2
latin4|iso-8859-4
latin5|iso-8859-9
logical|windows-1255
ms_Kanji|shift_jis
shift-jis|shift_jis
shift_jis
unicode
unicode-1-1-utf-7|utf-7
unicode-1-1-utf-8|utf-8
unicode-2-0-utf-8|utf-8
unicodeFFFE
us|iso-8859-1
us-ascii|iso-8859-1
utf-7
utf-8
visual|iso-8859-8
windows-1250
windows-1251
windows-1252
windows-1253
Windows-1254|iso-8859-9
windows-1255
windows-1256
windows-1257
windows-1258
windows-874
x-ansi|windows-1252
x-cp1250|Windows-1250
x-cp1251|Windows-1251
x-euc|euc-jp
x-euc-jp|euc-jp
x-ms-cp932|shift_jis
x-sjis|shift_jis
x-unicode-2-0-utf-7|utf-7
x-unicode-2-0-utf-8|utf-8
x-user-defined
x-x-big5|big5
_autodetect
_autodetect_all
_autodetect_kr
_iso-2022-jp$ESC
_iso-2022-jp$SIO


'字符集編碼轉換函數,將任意字符集轉換成BSTR

    Function Bytes2Bstr(body,chrset)
        ' Decodes binary data into a string.
        '   body   - the binary data to decode
        '   chrset - charset name used to interpret the bytes (e.g. "utf-8", "gb2312")
        ' Returns the text read back from the stream after switching it to text mode.
        Dim decoder
        Set decoder = Server.CreateObject("adodb.stream")
        With decoder
            .Mode = 3            ' adModeReadWrite
            .Type = 1            ' adTypeBinary: accept the raw bytes as-is
            .Open
            .Write body
            .Position = 0        ' rewind so the stream type can be switched
            .Type = 2            ' adTypeText
            .Charset = chrset
            Bytes2Bstr = .ReadText
            .Close
        End With
        Set decoder = Nothing
    End Function

利用ADODB.Stream判斷文件編碼(2008-03-14 15:49)

文件開頭的BOM(字節順序標記)與編碼的對應關係如下;ANSI文件沒有格式定義(無BOM):

EF BB BF — UTF-8
FF FE — UTF-16/UCS-2, little endian
FE FF — UTF-16/UCS-2, big endian
FF FE 00 00 — UTF-32/UCS-4, little endian
00 00 FE FF — UTF-32/UCS-4, big endian

    Function checkcode(path)
        ' Guesses a file's text encoding from its byte-order mark (BOM).
        '   path - path of the file to inspect
        ' Returns an ADODB.Stream charset name:
        '   "utf-8"        file starts with EF BB BF
        '   "unicode"      file starts with FF FE (UTF-16 little endian)
        '   "unicodeFFFE"  file starts with FE FF (UTF-16 big endian)
        '   "gb2312"       anything else, including empty files and BOM-less UTF-8
        ' Limitation: a UTF-32 little-endian BOM (FF FE 00 00) also begins with
        ' FF FE and is therefore reported as "unicode".
        Dim objstream, bintou, nBytes, b1, b2, b3
        Set objstream = Server.CreateObject("adodb.stream")
        objstream.Type = 1      ' adTypeBinary
        objstream.Mode = 3      ' adModeReadWrite
        objstream.Open
        objstream.LoadFromFile path
        nBytes = objstream.Size

        ' Default when no BOM is recognised.
        checkcode = "gb2312"

        ' Read returns Null on an empty stream and AscB fails on a missing
        ' byte, so only inspect the header when at least two bytes exist.
        If nBytes >= 2 Then
            bintou = objstream.Read(3)   ' returns fewer bytes if the file is shorter
            b1 = AscB(MidB(bintou, 1, 1))
            b2 = AscB(MidB(bintou, 2, 1))
            If nBytes >= 3 Then
                b3 = AscB(MidB(bintou, 3, 1))
            Else
                b3 = -1                  ' no third byte: cannot be a UTF-8 BOM
            End If

            ' The UTF-8 BOM is three bytes; checking only EF BB would misdetect
            ' a GB2312 file that happens to start with that byte pair.
            If b1 = &HEF And b2 = &HBB And b3 = &HBF Then
                checkcode = "utf-8"
            ElseIf b1 = &HFF And b2 = &HFE Then
                checkcode = "unicode"
            ElseIf b1 = &HFE And b2 = &HFF Then
                checkcode = "unicodeFFFE"
            End If
        End If

        objstream.Close
        Set objstream = Nothing
    End Function

UTF8文件有BOM和No BOM兩種,其區別在於BOM多了三個字節(EFBBBF),對比了二進制文件,發現有這個區別,如果是NoBOM的文件就無法通過前兩個字節判斷是否為UTF-8文件。 對於沒有BOM的UTF-8文件,可以通過遍歷判斷是否為UTF-8編碼。遍歷方法可以是二進制的移位判斷或者是正則。


參考出處

目前有 + 人訪問,有 0+ 條評論! 感謝支持!