基于Cpdetector 檢測文件編碼

jopen 9年前發布 | 39K 次閱讀常用工具包 Cpdetector

使用 Cpdetector jar 包，提供兩種方式檢測文件編碼，至於選擇哪種需要根據個人需求，代碼中有注釋說明。依賴 antlr-2.7.4.jar、chardet-1.0.jar、jargs-1.0.jar 三個 jar 包，可以在官網下載： http://cpdetector.sourceforge.net/

話不多說附上代碼：

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.ByteOrderMarkDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.log4j.Logger;
/**

<p>
獲取流編碼,不保證完全正確，設置檢測策略 isFast為true為快速檢測策略，false為正常檢測
InputStream 支持mark,則會在檢測后調用reset，外部可重新使用。
InputStream 流沒有關閉。
</p>

<p>
如果采用快速檢測編碼方式,最多會掃描8個字節，依次采用的{@link UnicodeDetector}，{@link byteOrderMarkDetector}，
{@link JChardetFacade}， {@link ASCIIDetector}檢測。對于一些標準的unicode編碼，適合這個方式或者對耗時敏感的。
</p>

<p>
采用正常檢測，讀取指定字節數，如果沒有指定，默認讀取全部字節檢測，依次采用的{@link byteOrderMarkDetector}，{@link parsingDetector}，{@link JChardetFacade}， {@link ASCIIDetector}檢測。
字節越多檢測時間越長，正確率較高。
</p>
@author WuKong
/
public class CpdetectorEncoding {
private static final Logger logger = Logger.getLogger(CpdetectorEncoding.class);
/**

<p>
獲取流編碼,不保證完全正確，設置檢測策略 isFast為true為快速檢測策略，false為正常檢測
InputStream 支持mark,則會在檢測后調用reset，外部可重新使用。
InputStream 流沒有關閉。
</p>

<p>
如果采用快速檢測編碼方式,最多會掃描8個字節，依次采用的{@link UnicodeDetector}，{@link byteOrderMarkDetector}，
{@link JChardetFacade}， {@link ASCIIDetector}檢測。對于一些標準的unicode編碼，適合這個方式或者對耗時敏感的。
</p>

<p>
采用正常檢測，讀取指定字節數，如果沒有指定，默認讀取全部字節檢測，依次采用的{@link byteOrderMarkDetector}，{@link parsingDetector}，{@link JChardetFacade}， {@link ASCIIDetector}檢測。
字節越多檢測時間越長，正確率較高。
</p>
*
@param in 輸入流  isFast 是否采用快速檢測編碼方式
@return Charset The character are now - hopefully - correct。如果為null，沒有檢測出來。
@throws IOException 
*/
public Charset getEncoding(InputStream buffIn,boolean isFast) throws IOException{
return getEncoding(buffIn,buffIn.available(),isFast);
}
public Charset getFastEncoding(InputStream buffIn) throws IOException{
 return getEncoding(buffIn,MAX_READBYTE_FAST,DEFALUT_DETECT_STRATEGY);
}





public Charset getEncoding(InputStream in, int size, boolean isFast) throws IOException {

    try {

        java.nio.charset.Charset charset = null;

        int tmpSize = in.available();
        size = size >tmpSize?tmpSize:size;
        //if in support mark method, 
        if(in.markSupported()){

            if(isFast){

                size = size>MAX_READBYTE_FAST?MAX_READBYTE_FAST:size;
                in.mark(size++);
                charset = getFastDetector().detectCodepage(in, size);
            }else{

                in.mark(size++);
                charset = getDetector().detectCodepage(in, size);
            }
            in.reset();

        }else{

            if(isFast){

                size = size>MAX_READBYTE_FAST?MAX_READBYTE_FAST:size;
                charset = getFastDetector().detectCodepage(in, size);
            }else{
                charset = getDetector().detectCodepage(in, size);
            }
        }


        return charset;
    }catch(IllegalArgumentException e){

        logger.error(e.getMessage(),e);
        throw e;
    } catch (IOException e) {

        logger.error(e.getMessage(),e);
        throw e;
    }

}


public Charset getEncoding(byte[] byteArr,boolean isFast) throws IOException{

    return getEncoding(byteArr, byteArr.length, isFast);
}


public Charset getFastEncoding(byte[] byteArr) throws IOException{

    return getEncoding(byteArr, MAX_READBYTE_FAST, DEFALUT_DETECT_STRATEGY);
}


public Charset getEncoding(byte[] byteArr, int size,boolean isFast) throws IOException {

    size = byteArr.length>size?size:byteArr.length;
    if(isFast){
        size = size>MAX_READBYTE_FAST?MAX_READBYTE_FAST:size;
    }

    ByteArrayInputStream byteArrIn = new ByteArrayInputStream(byteArr,0,size);
    BufferedInputStream in = new BufferedInputStream(byteArrIn);

    try {

        Charset charset = null;
        if(isFast){

            charset = getFastDetector().detectCodepage(in, size);
        }else{

            charset = getDetector().detectCodepage(in, size);
        }

        return charset;
    } catch (IllegalArgumentException e) {

        logger.error(e.getMessage(),e);
        throw e;
    } catch (IOException e) {

        logger.error(e.getMessage(),e);
        throw e;
    }

}



private static CodepageDetectorProxy detector =null;
private static CodepageDetectorProxy fastDtector =null;
private static ParsingDetector parsingDetector =  new ParsingDetector(false);
private static ByteOrderMarkDetector byteOrderMarkDetector = new ByteOrderMarkDetector();

//default strategy use fastDtector
private static final boolean DEFALUT_DETECT_STRATEGY = true;

private static final int MAX_READBYTE_FAST = 8; 

private static CodepageDetectorProxy getDetector(){

    if(detector==null){

        detector = CodepageDetectorProxy.getInstance();
         // Add the implementations of info.monitorenter.cpdetector.io.ICodepageDetector: 
        // This one is quick if we deal with unicode codepages:
        detector.add(byteOrderMarkDetector);
        // The first instance delegated to tries to detect the meta charset attribut in html pages.
        detector.add(parsingDetector);
        // This one does the tricks of exclusion and frequency detection, if first implementation is 
        // unsuccessful:
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
    }

    return detector;
}


private static CodepageDetectorProxy getFastDetector(){

    if(fastDtector==null){

        fastDtector = CodepageDetectorProxy.getInstance();
        fastDtector.add(UnicodeDetector.getInstance());
        fastDtector.add(byteOrderMarkDetector); 
        fastDtector.add(JChardetFacade.getInstance());
        fastDtector.add(ASCIIDetector.getInstance());
    }

    return fastDtector;
}


}</pre>

本文由用戶 jopen 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯繫管理員。

轉載本站原創文章，請注明出處，并保留原始鏈接、圖片水印。

本站是一個以用戶分享為主的開源技術平臺，歡迎各類分享！

本文地址：http://www.baiduhome.net/lib/view/open1421579818062.html

常用工具包 Cpdetector

基于Cpdetector 檢測文件編碼

相關經驗

相關資訊

相關文檔

目錄