基于Cpdetector 檢測文件編碼
使用Cpdetector jar包,提供兩種方式檢測文件編碼,至于選擇哪種,需要根據個人需求,文檔有注釋。依賴antlr-2.7.4.jar,chardet-1.0.jar,jargs-1.0.jar三個jar包,可以在官網下載: http://cpdetector.sourceforge.net/ 。
話不多說 附上代碼:
import info.monitorenter.cpdetector.io.ASCIIDetector; import info.monitorenter.cpdetector.io.ByteOrderMarkDetector; import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import info.monitorenter.cpdetector.io.JChardetFacade; import info.monitorenter.cpdetector.io.ParsingDetector; import info.monitorenter.cpdetector.io.UnicodeDetector;import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset;
import org.apache.log4j.Logger;
/**
- <p>
- 獲取流編碼,不保證完全正確,設置檢測策略 isFast為true為快速檢測策略,false為正常檢測
- InputStream 支持mark,則會在檢測后調用reset,外部可重新使用。
- InputStream 流沒有關閉。
- </p>
- <p>
- 如果采用快速檢測編碼方式,最多會掃描8個字節,依次采用的{@link UnicodeDetector},{@link byteOrderMarkDetector},
- {@link JChardetFacade}, {@link ASCIIDetector}檢測。對于一些標準的unicode編碼,適合這個方式或者對耗時敏感的。
- </p>
- <p>
- 采用正常檢測,讀取指定字節數,如果沒有指定,默認讀取全部字節檢測,依次采用的{@link byteOrderMarkDetector},{@link parsingDetector},{@link JChardetFacade}, {@link ASCIIDetector}檢測。
- 字節越多檢測時間越長,正確率較高。
- </p>
- @author WuKong / public class CpdetectorEncoding { - private static final Logger logger = Logger.getLogger(CpdetectorEncoding.class); - /** - <p>
- 獲取流編碼,不保證完全正確,設置檢測策略 isFast為true為快速檢測策略,false為正常檢測
- InputStream 支持mark,則會在檢測后調用reset,外部可重新使用。
- InputStream 流沒有關閉。
- </p>
- <p>
- 如果采用快速檢測編碼方式,最多會掃描8個字節,依次采用的{@link UnicodeDetector},{@link byteOrderMarkDetector},
- {@link JChardetFacade}, {@link ASCIIDetector}檢測。對于一些標準的unicode編碼,適合這個方式或者對耗時敏感的。
- </p>
- <p>
- 采用正常檢測,讀取指定字節數,如果沒有指定,默認讀取全部字節檢測,依次采用的{@link byteOrderMarkDetector},{@link parsingDetector},{@link JChardetFacade}, {@link ASCIIDetector}檢測。
- 字節越多檢測時間越長,正確率較高。
- </p> *
- @param in 輸入流 isFast 是否采用快速檢測編碼方式
- @return Charset The character are now - hopefully - correct。如果為null,沒有檢測出來。
- @throws IOException */ public Charset getEncoding(InputStream buffIn,boolean isFast) throws IOException{ - return getEncoding(buffIn,buffIn.available(),isFast); } - public Charset getFastEncoding(InputStream buffIn) throws IOException{ return getEncoding(buffIn,MAX_READBYTE_FAST,DEFALUT_DETECT_STRATEGY); } 
 
/**
 * Detects the encoding of a stream by examining at most {@code size} bytes.
 * <p>
 * The effective length is the smallest of {@code size}, {@code in.available()} and, for
 * the fast strategy, {@code MAX_READBYTE_FAST}. When the stream supports mark it is marked
 * before detection and reset afterwards, so the caller can read it again from the same
 * position. The stream is never closed.
 * </p>
 *
 * @param in     the input stream to inspect
 * @param size   the maximum number of bytes to examine
 * @param isFast {@code true} to use the fast detector chain, {@code false} for the
 *               normal one
 * @return the charset reported by the detector chain
 * @throws IOException              if reading, marking or resetting the stream fails
 * @throws IllegalArgumentException if the detector chain rejects its arguments; the
 *                                  exception is logged and rethrown unchanged
 */
public Charset getEncoding(InputStream in, int size, boolean isFast) throws IOException {
    try {
        // Never request more than the stream reports as currently available.
        int length = Math.min(size, in.available());
        if (isFast) {
            length = Math.min(length, MAX_READBYTE_FAST);
        }
        CodepageDetectorProxy proxy = isFast ? getFastDetector() : getDetector();
        if (!in.markSupported()) {
            // Without mark support the examined bytes are consumed from the caller's stream.
            return proxy.detectCodepage(in, length);
        }
        // The read limit is one larger than the length handed to the detector, so the
        // mark stays valid for reset(). The original marked with `size` but then asked
        // the detector for `size + 1` bytes.
        in.mark(length + 1);
        Charset charset = proxy.detectCodepage(in, length);
        in.reset();
        return charset;
    } catch (IllegalArgumentException e) {
        logger.error(e.getMessage(), e);
        throw e;
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
}
/**
 * Detects the encoding of a byte array, offering the whole array to the detector.
 *
 * @param byteArr the bytes to inspect
 * @param isFast  {@code true} to use the fast detection strategy, {@code false} for the
 *                normal one
 * @return the charset returned by {@code getEncoding(byte[], int, boolean)}
 * @throws IOException if detection fails while reading the bytes
 */
public Charset getEncoding(byte[] byteArr, boolean isFast) throws IOException {
    final int wholeLength = byteArr.length;
    return getEncoding(byteArr, wholeLength, isFast);
}
/**
 * Detects the encoding of a byte array with the default strategy
 * ({@code DEFALUT_DETECT_STRATEGY}) and a requested length of {@code MAX_READBYTE_FAST}.
 *
 * @param byteArr the bytes to inspect
 * @return the charset returned by {@code getEncoding(byte[], int, boolean)}
 * @throws IOException if detection fails while reading the bytes
 */
public Charset getFastEncoding(byte[] byteArr) throws IOException {
    final int limit = MAX_READBYTE_FAST;
    final boolean strategy = DEFALUT_DETECT_STRATEGY;
    return getEncoding(byteArr, limit, strategy);
}
/**
 * Detects the encoding of the first bytes of an array.
 * <p>
 * The examined length is the smaller of {@code size} and {@code byteArr.length}, further
 * capped at {@code MAX_READBYTE_FAST} when the fast strategy is selected. The bytes are
 * exposed to the detector chain through a buffered in-memory stream.
 * </p>
 *
 * @param byteArr the bytes to inspect
 * @param size    the maximum number of bytes to examine
 * @param isFast  {@code true} to use the fast detector chain, {@code false} for the
 *                normal one
 * @return the charset reported by the detector chain
 * @throws IOException              if the detector chain fails while reading
 * @throws IllegalArgumentException if the detector chain rejects its arguments; the
 *                                  exception is logged and rethrown unchanged
 */
public Charset getEncoding(byte[] byteArr, int size, boolean isFast) throws IOException {
    int length = Math.min(size, byteArr.length);
    if (isFast && length > MAX_READBYTE_FAST) {
        length = MAX_READBYTE_FAST;
    }
    BufferedInputStream in =
            new BufferedInputStream(new ByteArrayInputStream(byteArr, 0, length));
    try {
        CodepageDetectorProxy proxy = isFast ? getFastDetector() : getDetector();
        return proxy.detectCodepage(in, length);
    } catch (IllegalArgumentException e) {
        logger.error(e.getMessage(),e);
        throw e;
    } catch (IOException e) {
        logger.error(e.getMessage(),e);
        throw e;
    }
}
// Detector chains, created and populated on first use by getDetector() / getFastDetector().
private static CodepageDetectorProxy detector = null;
private static CodepageDetectorProxy fastDtector = null;
// Detector instances registered with the chains above; created once and never reassigned.
private static final ParsingDetector parsingDetector = new ParsingDetector(false);
private static final ByteOrderMarkDetector byteOrderMarkDetector = new ByteOrderMarkDetector();
// Default strategy: use the fast detector chain.
private static final boolean DEFALUT_DETECT_STRATEGY = true;
// Upper bound on the length requested from the fast detector chain.
private static final int MAX_READBYTE_FAST = 8;
/**
 * Returns the detector chain for normal detection, building it on first use.
 * <p>
 * Registration order: {@code byteOrderMarkDetector}, {@code parsingDetector},
 * {@link JChardetFacade}, {@link ASCIIDetector}.
 * </p>
 * NOTE(review): {@code CodepageDetectorProxy.getInstance()} looks like a shared singleton
 * accessor; if so, this chain and the fast chain are the same object. Confirm against the
 * cpdetector documentation.
 *
 * @return the cached proxy for normal detection
 */
private static CodepageDetectorProxy getDetector() {
    if (detector != null) {
        return detector;
    }
    detector = CodepageDetectorProxy.getInstance();
    CodepageDetectorProxy chain = detector;
    // Byte-order-mark check first: quick when the data is a Unicode codepage.
    chain.add(byteOrderMarkDetector);
    // Then look for a charset declared in the content itself (e.g. an HTML meta attribute).
    chain.add(parsingDetector);
    // Exclusion and frequency analysis if the earlier detectors were unsuccessful.
    chain.add(JChardetFacade.getInstance());
    chain.add(ASCIIDetector.getInstance());
    return chain;
}
/**
 * Returns the detector chain for fast detection, building it on first use.
 * <p>
 * Registration order: {@link UnicodeDetector}, {@code byteOrderMarkDetector},
 * {@link JChardetFacade}, {@link ASCIIDetector}.
 * </p>
 * NOTE(review): {@code CodepageDetectorProxy.getInstance()} looks like a shared singleton
 * accessor; if so, this chain and the normal chain are the same object. Confirm against
 * the cpdetector documentation.
 *
 * @return the cached proxy for fast detection
 */
private static CodepageDetectorProxy getFastDetector() {
    if (fastDtector != null) {
        return fastDtector;
    }
    fastDtector = CodepageDetectorProxy.getInstance();
    CodepageDetectorProxy chain = fastDtector;
    chain.add(UnicodeDetector.getInstance());
    chain.add(byteOrderMarkDetector);
    chain.add(JChardetFacade.getInstance());
    chain.add(ASCIIDetector.getInstance());
    return chain;
}
}