使用lingpipe自然語言處理包進行文本分類

fmms 12年前發布 | 76K 次閱讀 Java 搜索引擎
TrainTClassifier 是基于 TF/IDF 算法的分類器訓練程序。必須先把語料庫中的文章放到各自所屬的分類文件夾中，比如：與金融相關的文章就放到「金融」這個文件夾中。我這里的根目錄是 f:/data/category。訓練完后會生成一個分類器模型 tclassifier，之后其它文本的分類就是通過它來確定。
/**

使用 Lingpipe的TF/IDF分類器訓練語料 

@author laigood 
*/

public class TrainTClassifier {
//訓練語料文件夾

 private static File TDIR = new File("f:\data\category");

 //定義分類

 private static String[] CATEGORIES = { "金融", "軍事", "醫學", "飲食" };
public static void main(String[] args) throws ClassNotFoundException,

     IOException {  

 TfIdfClassifierTrainer<CharSequence> classifier = new TfIdfClassifierTrainer<CharSequence>(  
         new TokenFeatureExtractor(CharacterTokenizerFactory.INSTANCE));  

 // 開始訓練  
 for (int i = 0; i < CATEGORIES.length; i++) {  
     File classDir = new File(TDIR, CATEGORIES[i]);  
     if (!classDir.isDirectory()) {  
         System.out.println("不能找到目錄=" + classDir);  
     }  

     // 訓練器遍歷分類文件夾下的所有文件  
     for (File file : classDir.listFiles()) {  
         String text = Files.readFromFile(file, "utf-8");  
         System.out.println("正在訓練 " + CATEGORIES[i] + file.getName());  
         Classification classification = new Classification(  
                 CATEGORIES[i]);  
         Classified<CharSequence> classified = new Classified<CharSequence>(  
                 text, classification);  
         classifier.handle(classified);  
     }   
 }  



    // 把分類器模型寫到文件上  
    System.out.println("開始生成分類器");  
    String modelFile = "f:\\data\\category\\tclassifier";  
    ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(  
            modelFile));  
    classifier.compileTo(os);  
    os.close();  

    System.out.println("分類器生成完成");  
}  

}</pre> 
TestTClassifier，測試分類的準確度。測試數據的存放方式與上面的訓練語料類似（每個分類一個文件夾）。

public class TestTClassifier {

//測試語料的存放目錄  
private static File TDIR = new File("f:\\data\\test");  
private static String[] CATEGORIES = { "金融", "軍事", "醫學", "飲食" };  

public static void main(String[] args) throws ClassNotFoundException {  

    //分類器模型存放地址  
    String modelFile = "f:\\data\\category\\tclassifier";  
    ScoredClassifier<CharSequence> compiledClassifier = null;  
    try {  
        ObjectInputStream oi = new ObjectInputStream(new FileInputStream(  
                modelFile));  
        compiledClassifier = (ScoredClassifier<CharSequence>) oi  
                .readObject();  
        oi.close();  
    } catch (IOException ie) {  
        System.out.println("IO Error: Model file " + modelFile + " missing");  
    }  

    // 遍歷分類目錄中的文件測試分類準確度  
    ConfusionMatrix confMatrix = new ConfusionMatrix(CATEGORIES);  
    NumberFormat nf = NumberFormat.getInstance();  
    nf.setMaximumIntegerDigits(1);  
    nf.setMaximumFractionDigits(3);  
    for (int i = 0; i < CATEGORIES.length; ++i) {  
        File classDir = new File(TDIR, CATEGORIES[i]);  

        //對于每一個文件，通過分類器找出最適合的分類  
        for (File file : classDir.listFiles()) {  
            String text = "";  
            try {  
                text = Files.readFromFile(file, "utf-8");  
            } catch (IOException ie) {  
                System.out.println("不能讀取 " + file.getName());  
            }  
            System.out.println("測試 " + CATEGORIES[i]  
                    + File.separator + file.getName());  

            ScoredClassification classification = compiledClassifier  
                    .classify(text.subSequence(0, text.length()));  
            confMatrix.increment(CATEGORIES[i],  
                    classification.bestCategory());  
            System.out.println("最適合的分類: "  
                    + classification.bestCategory());  
        }   
    }   

    System.out.println("--------------------------------------------");  
    System.out.println("- 結果 ");  
    System.out.println("--------------------------------------------");  
    int[][] imatrix = confMatrix.matrix();  
    StringBuffer sb = new StringBuffer();  
    sb.append(StringTools.fillin("CATEGORY", 10, true, ' '));  
    for (int i = 0; i < CATEGORIES.length; i++)  
        sb.append(StringTools.fillin(CATEGORIES[i], 8, false, ' '));  
    System.out.println(sb.toString());  

    for (int i = 0; i < imatrix.length; i++) {  
        sb = new StringBuffer();  
        sb.append(StringTools.fillin(CATEGORIES[i], 10, true, ' ',  
                10 - CATEGORIES[i].length()));  
        for (int j = 0; j < imatrix.length; j++) {  
            String out = "" + imatrix[i][j];  
            sb.append(StringTools.fillin(out, 8, false, ' ',  
                    8 - out.length()));  
        }  
        System.out.println(sb.toString());  
    }  

    System.out.println("準確度: "  
            + nf.format(confMatrix.totalAccuracy()));  
    System.out.println("總共正確數 : " + confMatrix.totalCorrect());  
    System.out.println("總數：" + confMatrix.totalCount());  
}  

}</pre> 

補上上面測試類用到的 StringTools 工具類：

/**

A class containing a bunch of string utilities - <br>
a. filterChars: Remove extraneous characters from a string and return a
"clean" string. <br>
b. getSuffix: Given a file name return its extension. <br>
c. fillin: pad or truncate a string to a fixed number of characters. <br>
d. removeAmpersandStrings: remove strings that start with ampersand <br>
e. shaDigest: Compute the 40 byte digest signature of a string <br>
/
public class StringTools {
public static final Locale LOCALE = new Locale("en");
//  -- String limit for StringTools
private static int STRING_TOOLS_LIMIT = 1000000;
// -- pre-compiled RE patterns
private static Pattern extPattern = Pattern.compile("^..{1}quot;);
private static Pattern spacesPattern = Pattern.compile("\s+");
private static Pattern removeAmpersandPattern = Pattern.compile("&[^;]*?;");
/**

Removes non-printable spaces and replaces with a single space

@param in
String with mixed characters
@return String with collapsed spaces and printable characters
*/
public static String filterChars(String in) {
return (filterChars(in, "", ' ', true));
}

public static String filterChars(String in, boolean newLine) {
 return (filterChars(in, "", ' ', newLine));
}
public static String filterChars(String in, String badChars) {
 return (filterChars(in, badChars, ' ', true));
}
public static String filterChars(String in, char replaceChar) {
 return (filterChars(in, "", replaceChar, true));
}
public static String filterChars(String in, String badChars,
   char replaceChar, boolean newLine) {
 if (in == null)
   return "";
 int inLen = in.length();
 if (inLen > STRING_TOOLS_LIMIT)
   return in;
 try {
   // **-- replace non-recognizable characters with spaces
   StringBuffer out = new StringBuffer();
   int badLen = badChars.length();
   for (int i = 0; i < inLen; i++) {

 char ch = in.charAt(i);
 if ((badLen != 0) && removeChar(ch, badChars)) {
   ch = replaceChar;
 } else if (!Character.isDefined(ch) && !Character.isSpaceChar(ch)) {
   ch = replaceChar;
 }
 out.append(ch);

}
// *-- replace new lines with space
   Matcher matcher = null;
   in = out.toString();
// *-- replace consecutive spaces with single space and remove
   // leading/trailing spaces
   in = in.trim();
   matcher = spacesPattern.matcher(in);
   in = matcher.replaceAll(" ");
 } catch (OutOfMemoryError e) {
   return in;
 }
return in;
}
// *-- remove any chars found in the badChars string
private static boolean removeChar(char ch, String badChars) {
 if (badChars.length() == 0)
   return false;
 for (int i = 0; i < badChars.length(); i++) {
   if (ch == badChars.charAt(i))

 return true;

}
 return false;
}
/**

Return the extension of a file, if possible.

@param filename
@return string
*/
public static String getSuffix(String filename) {
if (filename.length() > STRING_TOOLS_LIMIT)
 return ("");
Matcher matcher = extPattern.matcher(filename);
if (!matcher.matches())
 return "";
return (matcher.group(1).toLowerCase(LOCALE));
}

public static String fillin(String in, int len) {
 return fillin(in, len, true, ' ', 3);
}
public static String fillin(String in, int len, char fillinChar) {
 return fillin(in, len, true, fillinChar, 3);
}
public static String fillin(String in, int len, boolean right) {
 return fillin(in, len, right, ' ', 3);
}
public static String fillin(String in, int len, boolean right, char fillinChar) {
 return fillin(in, len, right, fillinChar, 3);
}
/**

Return a string concatenated or padded to the specified length

@param in
string to be truncated or padded
@param len
int length for string
@param right
boolean fillin from the left or right
@param fillinChar
char to pad the string
@param numFills
int number of characters to pad
@return String of specified length
/
public static String fillin(String in, int len, boolean right,
 char fillinChar, int numFills) {
// -- return if string is of required length
int slen = in.length();
if ((slen == len) || (slen > STRING_TOOLS_LIMIT))
 return (in);
// *-- build the fillin string
StringBuffer fillinStb = new StringBuffer();
for (int i = 0; i < numFills; i++)
 fillinStb.append(fillinChar);
String fillinString = fillinStb.toString();
// *-- truncate and pad string if length exceeds required length
if (slen > len) {
 if (right)
   return (in.substring(0, len - numFills) + fillinString);
 else
   return (fillinString + in.substring(slen - len + numFills, slen));
}
// *-- pad string if length is less than required length DatabaseEntry
// dbe = dbt.getNextKey(); String dbkey = new String (dbe.getData());
StringBuffer sb = new StringBuffer();
if (right)
 sb.append(in);
sb.append(fillinString);
if (!right)
 sb.append(in);
return (sb.toString());
}


/**

Remove ampersand strings such as \&nbsp;

@param in
Text string extracted from Web pages
@return String Text string without ampersand strings
*/
public static String removeAmpersandStrings(String in) {
if (in.length() > STRING_TOOLS_LIMIT)
 return (in);
Matcher matcher = removeAmpersandPattern.matcher(in);
return (matcher.replaceAll(""));
}

/**

Escape back slashes

@param in
Text to be escaped
@return String Escaped test
*/
public static String escapeText(String in) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < in.length(); i++) {
 char ch = in.charAt(i);
 if (ch == '\')
   sb.append("\\");
 else
   sb.append(ch);
}
return (sb.toString());
}

/**

Get the SHA signature of a string

@param in
String
@return String SHA signature of in
/
public static String shaDigest(String in) {
StringBuffer out = new StringBuffer();
if ((in == null) || (in.length() == 0))
 return ("");
try {
 // -- create a message digest instance and compute the hash
 // byte array
 MessageDigest md = MessageDigest.getInstance("SHA-1");
 md.reset();
 md.update(in.getBytes());
 byte[] hash = md.digest();
// --- Convert the hash byte array to hexadecimal format, pad
 // hex chars with leading zeroes
 // --- to get a signature of consistent length (40) for all
 // strings.
 for (int i = 0; i < hash.length; i++) {
   out.append(fillin(Integer.toString(0xFF & hash[i], 16), 2, false, '0',

   1));

}
} catch (OutOfMemoryError e) {
 return ("<-------------OUT_OF_MEMORY------------>");
} catch (NoSuchAlgorithmException e) {
 return ("<------SHA digest algorithm not found--->");
}
return (out.toString());
}


/**

Return the string with the first letter upper cased

@param in
@return String
*/
public static String firstLetterUC(String in) {
if ((in == null) || (in.length() == 0))
 return ("");
String out = in.toLowerCase(LOCALE);
String part1 = out.substring(0, 1);
String part2 = out.substring(1, in.length());
return (part1.toUpperCase(LOCALE) + part2.toLowerCase(LOCALE));
}

/**

Return a pattern that can be used to collapse consecutive patterns of the
same type

@param entityTypes
A list of entity types
@return Regex pattern for the entity types
*/
public static Pattern getCollapsePattern(String[] entityTypes) {
Pattern collapsePattern = null;
StringBuffer collapseStr = new StringBuffer();
for (int i = 0; i < entityTypes.length; i++) {
 collapseStr.append("(<\/");
 collapseStr.append(entityTypes[i]);
 collapseStr.append(">\s+");
 collapseStr.append("<");
 collapseStr.append(entityTypes[i]);
 collapseStr.append(">)|");
}
collapsePattern = Pattern.compile(collapseStr.toString().substring(0,
   collapseStr.length() - 1));
return (collapsePattern);
}

/**

return a double that indicates the degree of similarity between two strings
Use the Jaccard similarity, i.e. the ratio of A intersection B to A union B

@param first
string
@param second
string
@return double degreee of similarity
*/
public static double stringSimilarity(String first, String second) {
if ((first == null) || (second == null))
 return (0.0);
String[] a = first.split("\s+");
String[] b = second.split("\s+");
// *-- compute a union b
HashSet<String> aUnionb = new HashSet<String>();
HashSet<String> aTokens = new HashSet<String>();
HashSet<String> bTokens = new HashSet<String>();
for (int i = 0; i < a.length; i++) {
 aUnionb.add(a[i]);
 aTokens.add(a[i]);
}
for (int i = 0; i < b.length; i++) {
 aUnionb.add(b[i]);
 bTokens.add(b[i]);
}
int sizeAunionB = aUnionb.size();
// *-- compute a intersect b
Iterator <String> iter = aUnionb.iterator();
int sizeAinterB = 0;
while (iter != null && iter.hasNext()) {
 String token = (String) iter.next();
 if (aTokens.contains(token) && bTokens.contains(token))
   sizeAinterB++;
}
return ((sizeAunionB > 0) ? (sizeAinterB + 0.0) / sizeAunionB : 0.0);
}


/**

Return the edit distance between the two strings

@param s1
@param s2
@return double
*/
public static double editDistance(String s1, String s2) {
if ((s1.length() == 0) || (s2.length() == 0))
 return (0.0);
return EditDistance.editDistance(s1.subSequence(0, s1.length()), s2
   .subSequence(0, s2.length()), false);
}

/**

Return a string with the contents from the passed reader

@param r Reader
@return String
*/
public static String readerToString(Reader r) {
int charValue;
StringBuffer sb = new StringBuffer(1024);
try {
 while ((charValue = r.read()) != -1)
   sb.append((char) charValue);
} catch (IOException ie) {
 sb.setLength(0);
}
return (sb.toString());
}

/**

Clean up a sentence by consecutive non-alphanumeric chars with a single
non-alphanumeric char

@param in Array of chars
@return String
*/
public static String cleanString(char[] in) {
int len = in.length;
boolean prevOK = true;
for (int i = 0; i < len; i++) {
 if (Character.isLetterOrDigit(in[i]) || Character.isWhitespace(in[i]))
   prevOK = true;
 else {
   if (!prevOK)
 in[i] = ' ';

   prevOK = false;
 }
}
return (new String(in));
}

/**

Return a clean file name

@param filename
@return String
*/
public static String parseFile(String filename) {
return (filterChars(filename, "\/_:."));
}
}</pre> 
轉自：http://blog.csdn.net/laigood12345/article/details/6680201
本文由用戶 fmms 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯系管理員。
轉載本站原創文章，請注明出處，并保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺，歡迎各類分享！
本文地址：http://www.baiduhome.net/lib/view/open1331791184421.html
Java 搜索引擎
使用lingpipe自然語言處理包進行文本分類

相關經驗

相關資訊

相關文檔

目錄