基于Cpdetector 檢測文件編碼
使用Cpdetector jar包,提供兩種方式檢測文件編碼,至於選擇哪種,需要根據個人需求,代碼中有注釋說明。依賴antlr-2.7.4.jar、chardet-1.0.jar、jargs-1.0.jar三個jar包,可以在官網下載:http://cpdetector.sourceforge.net/。
話不多說 附上代碼:
import info.monitorenter.cpdetector.io.ASCIIDetector; import info.monitorenter.cpdetector.io.ByteOrderMarkDetector; import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import info.monitorenter.cpdetector.io.JChardetFacade; import info.monitorenter.cpdetector.io.ParsingDetector; import info.monitorenter.cpdetector.io.UnicodeDetector;import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset;
import org.apache.log4j.Logger;
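/**
 * <p>
 * Detects the character encoding of a stream or byte array. The result is not guaranteed to be
 * correct. The {@code isFast} flag selects the strategy: {@code true} for fast detection,
 * {@code false} for normal detection.
 * If the InputStream supports mark, reset is called after detection so the caller can reuse it.
 * The InputStream is not closed.
 * </p>
 * <p>
 * Fast detection inspects only a short prefix of the input (about 8 bytes) and registers
 * {@link UnicodeDetector}, {@link ByteOrderMarkDetector}, {@link JChardetFacade} and
 * {@link ASCIIDetector}, in that order. It suits standard unicode encodings or
 * latency-sensitive callers.
 * </p>
 * <p>
 * Normal detection reads the requested number of bytes (all available bytes when no count is
 * given) and registers {@link ByteOrderMarkDetector}, {@link ParsingDetector},
 * {@link JChardetFacade} and {@link ASCIIDetector}, in that order.
 * The more bytes are examined, the longer detection takes and the higher the accuracy.
 * </p>
 *
 * @author WuKong
 */
public class CpdetectorEncoding {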
// Class-wide log4j logger; the detection methods use it to record exceptions before rethrowing.
private static final Logger logger = Logger.getLogger(CpdetectorEncoding.class);
/**
 * Detects the charset of a stream, using the number of bytes the stream currently reports
 * as available for the byte budget.
 *
 * <p>This is a convenience overload: it delegates to
 * {@link #getEncoding(InputStream, int, boolean)} with {@code buffIn.available()} as the
 * size. The stream is not closed by this method.</p>
 *
 * @param buffIn input stream to inspect
 * @param isFast {@code true} to use the fast detection strategy, {@code false} for the
 *               normal one
 * @return the charset reported by the three-argument overload; the original author's note
 *         states that {@code null} means nothing was detected (not verified here)
 * @throws IOException if querying or reading the stream fails
 */
public Charset getEncoding(InputStream buffIn, boolean isFast) throws IOException {
    int availableBytes = buffIn.available();
    return getEncoding(buffIn, availableBytes, isFast);
}
/**
 * Detects the charset of a stream with the default strategy and the fast-mode byte budget.
 *
 * <p>Delegates to {@link #getEncoding(InputStream, int, boolean)} passing
 * {@code MAX_READBYTE_FAST} as the size and {@code DEFALUT_DETECT_STRATEGY} as the
 * strategy flag.</p>
 *
 * @param buffIn input stream to inspect
 * @return the charset reported by the three-argument overload
 * @throws IOException if querying or reading the stream fails
 */
public Charset getFastEncoding(InputStream buffIn) throws IOException {
    final int byteBudget = MAX_READBYTE_FAST;
    final boolean strategy = DEFALUT_DETECT_STRATEGY;
    return getEncoding(buffIn, byteBudget, strategy);
}
/**
 * Detects the charset of at most {@code size} bytes of {@code in}.
 *
 * <p>The byte budget is first capped at {@code in.available()} and, in fast mode, at
 * {@code MAX_READBYTE_FAST}. The same budget is used both as the mark read-limit and as the
 * detection length, so the detector is never asked to read past the mark.</p>
 *
 * <p>If {@code in} supports mark it is marked before detection and reset afterwards, so the
 * caller can read it again from the same position. Otherwise it is wrapped in a
 * {@link BufferedInputStream} for the duration of the call; bytes consumed by detection are
 * then not restored on {@code in}. The stream is never closed here.</p>
 *
 * @param in     input stream to inspect
 * @param size   maximum number of bytes to examine
 * @param isFast {@code true} to use the fast detector chain, {@code false} for the normal one
 * @return the charset reported by the detector chain
 * @throws IOException              if reading, marking or resetting the stream fails
 * @throws IllegalArgumentException if the detector rejects its arguments
 */
public Charset getEncoding(InputStream in, int size, boolean isFast) throws IOException {
    try {
        // Never request more than the stream reports as immediately available.
        int length = Math.min(size, in.available());
        if (isFast) {
            length = Math.min(length, MAX_READBYTE_FAST);
        }
        CodepageDetectorProxy proxy = isFast ? getFastDetector() : getDetector();

        if (in.markSupported()) {
            // Mark limit and detection length must agree; reading beyond the limit may
            // invalidate the mark and make reset() fail.
            in.mark(length);
            Charset charset = proxy.detectCodepage(in, length);
            in.reset();
            return charset;
        }

        // cpdetector's stream detection is specified for mark-capable streams, so give it one.
        // The buffer is sized to the budget (at least 1, as BufferedInputStream requires) to
        // avoid pulling more bytes out of the caller's stream than detection needs.
        InputStream markable = new BufferedInputStream(in, Math.max(length, 1));
        return proxy.detectCodepage(markable, length);
    } catch (IllegalArgumentException e) {
        logger.error(e.getMessage(), e);
        throw e;
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        throw e;
    }
}
/**
 * Detects the charset of a byte array, offering the whole array to the detector.
 *
 * <p>Delegates to {@link #getEncoding(byte[], int, boolean)} with {@code byteArr.length}
 * as the size.</p>
 *
 * @param byteArr bytes to inspect
 * @param isFast  {@code true} to use the fast detection strategy, {@code false} for the
 *                normal one
 * @return the charset reported by the three-argument overload
 * @throws IOException if the underlying detection fails while reading
 */
public Charset getEncoding(byte[] byteArr, boolean isFast) throws IOException {
    int wholeLength = byteArr.length;
    return getEncoding(byteArr, wholeLength, isFast);
}
/**
 * Detects the charset of a byte array with the default strategy and the fast-mode byte budget.
 *
 * <p>Delegates to {@link #getEncoding(byte[], int, boolean)} passing
 * {@code MAX_READBYTE_FAST} as the size and {@code DEFALUT_DETECT_STRATEGY} as the
 * strategy flag.</p>
 *
 * @param byteArr bytes to inspect
 * @return the charset reported by the three-argument overload
 * @throws IOException if the underlying detection fails while reading
 */
public Charset getFastEncoding(byte[] byteArr) throws IOException {
    final int byteBudget = MAX_READBYTE_FAST;
    final boolean strategy = DEFALUT_DETECT_STRATEGY;
    return getEncoding(byteArr, byteBudget, strategy);
}
/**
 * Detects the charset of the first {@code size} bytes of {@code byteArr}.
 *
 * <p>The byte budget is capped at the array length and, in fast mode, at
 * {@code MAX_READBYTE_FAST}. The selected prefix is exposed to the detector through a
 * {@link BufferedInputStream} over a {@link ByteArrayInputStream}.</p>
 *
 * @param byteArr bytes to inspect
 * @param size    maximum number of bytes to examine
 * @param isFast  {@code true} to use the fast detector chain, {@code false} for the normal one
 * @return the charset reported by the detector chain
 * @throws IOException              if the detector fails while reading
 * @throws IllegalArgumentException if the detector rejects its arguments
 */
public Charset getEncoding(byte[] byteArr, int size, boolean isFast) throws IOException {
    int limit = Math.min(size, byteArr.length);
    if (isFast && limit > MAX_READBYTE_FAST) {
        limit = MAX_READBYTE_FAST;
    }
    BufferedInputStream prefix =
            new BufferedInputStream(new ByteArrayInputStream(byteArr, 0, limit));
    try {
        if (isFast) {
            return getFastDetector().detectCodepage(prefix, limit);
        }
        return getDetector().detectCodepage(prefix, limit);
    } catch (IllegalArgumentException e) {
        // Log and propagate unchanged so callers still see the original exception.
        logger.error(e.getMessage(),e);
        throw e;
    } catch (IOException e) {
        logger.error(e.getMessage(),e);
        throw e;
    }
}
// Detector chain for normal detection; created lazily by getDetector().
private static CodepageDetectorProxy detector =null;
// Detector chain for fast detection; created lazily by getFastDetector().
private static CodepageDetectorProxy fastDtector =null;
// Shared ParsingDetector instance, added to the normal chain only.
// NOTE(review): the 'false' constructor argument is presumably a verbose/logging flag - confirm.
private static ParsingDetector parsingDetector = new ParsingDetector(false);
// Shared ByteOrderMarkDetector instance, added to both chains.
private static ByteOrderMarkDetector byteOrderMarkDetector = new ByteOrderMarkDetector();
// Default strategy flag used by the getFastEncoding overloads: true selects the fast detector.
private static final boolean DEFALUT_DETECT_STRATEGY = true;
// Upper bound on the number of bytes examined in fast mode.
private static final int MAX_READBYTE_FAST = 8;
/**
 * Returns the detector chain used for normal detection, building it on first use.
 *
 * <p>Delegates are registered in this order: byte-order-mark detector, parsing detector,
 * {@link JChardetFacade}, {@link ASCIIDetector}.</p>
 *
 * <p>NOTE(review): the lazy initialization is unsynchronized. Also, if
 * {@code CodepageDetectorProxy.getInstance()} returns a shared singleton (it appears to in
 * cpdetector), this chain and the fast chain are the same object - confirm.</p>
 *
 * @return the normal-mode detector proxy
 */
private static CodepageDetectorProxy getDetector() {
    if (detector != null) {
        return detector;
    }
    final CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
    detector = proxy;
    // Per the original author's notes: quick when dealing with unicode codepages.
    proxy.add(byteOrderMarkDetector);
    // Per the original author's notes: looks for the meta charset attribute in html pages.
    proxy.add(parsingDetector);
    // Per the original author's notes: exclusion and frequency detection, used when the
    // earlier delegates are unsuccessful.
    proxy.add(JChardetFacade.getInstance());
    proxy.add(ASCIIDetector.getInstance());
    return proxy;
}
/**
 * Returns the detector chain used for fast detection, building it on first use.
 *
 * <p>Delegates are registered in this order: {@link UnicodeDetector}, byte-order-mark
 * detector, {@link JChardetFacade}, {@link ASCIIDetector}.</p>
 *
 * <p>NOTE(review): the lazy initialization is unsynchronized. Also, if
 * {@code CodepageDetectorProxy.getInstance()} returns a shared singleton (it appears to in
 * cpdetector), this chain and the normal chain are the same object - confirm.</p>
 *
 * @return the fast-mode detector proxy
 */
private static CodepageDetectorProxy getFastDetector() {
    if (fastDtector != null) {
        return fastDtector;
    }
    final CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
    fastDtector = proxy;
    proxy.add(UnicodeDetector.getInstance());
    proxy.add(byteOrderMarkDetector);
    proxy.add(JChardetFacade.getInstance());
    proxy.add(ASCIIDetector.getInstance());
    return proxy;
}
}