Clustering news with Canopy generation and k-means clustering


From: http://blog.csdn.net/u012965373/article/details/50754420


/**
 * @author YangXin
 * @info Cluster news with Canopy generation and k-means clustering
 */
package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class ReutersToSparseVectors {

    public static void main(String[] args) throws Exception {

         int minSupport = 5;                    // ignore terms occurring fewer than 5 times in the corpus
         int minDf = 5;                         // minimum document frequency for TF-IDF
         int maxDFPercent = 95;                 // ignore terms present in more than 95% of documents
         int maxNGramSize = 1;                  // unigrams only
         float minLLRValue = 50;                // log-likelihood ratio threshold (only used for n-grams > 1)
         int reduceTasks = 1;                   // number of reduce tasks
         int chunkSize = 200;                   // dictionary chunk size in MB
         int norm = 2;                          // L2 normalization
         boolean sequentialAccessOutput = true; // emit sequential-access sparse vectors
    
         String inputDir = "inputDir";
    
         Configuration conf = new Configuration();
         FileSystem fs = FileSystem.get(conf);
    
         String outputDir = "reuters";
         HadoopUtil.delete(conf, new Path(outputDir));
         Path tokenizedPath = new Path(outputDir,
             DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
         // MyAnalyzer is a custom Lucene Analyzer that is not part of this listing;
         // a sketch of one possible implementation follows the listing.
         MyAnalyzer analyzer = new MyAnalyzer();
         DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
             .asSubclass(Analyzer.class), tokenizedPath, conf);
    
         // Build term-frequency vectors from the tokenized documents...
         DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
           new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
           chunkSize, sequentialAccessOutput, false);
         // ...then weight them with TF-IDF.
         TFIDFConverter.processTfIdf(
           new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
           new Path(outputDir), conf, chunkSize, minDf,
           maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);
    
         String vectorsFolder = outputDir + "/tfidf-vectors";

         // Read back the generated TF-IDF vectors and print them.
         SequenceFile.Reader reader = new SequenceFile.Reader(fs,
             new Path(vectorsFolder, "part-r-00000"), conf);
    
         Text key = new Text();
         VectorWritable value = new VectorWritable();
         while (reader.next(key, value)) {
           System.out.println(key.toString() + " = > "
                              + value.get().asFormatString());
         }
         reader.close();
    }
}
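
The listing depends on a MyAnalyzer class that is not shown. DocumentProcessor.tokenizeDocuments only receives the class reference, so whatever analyzer is used needs a public no-argument constructor. Below is a minimal sketch of what such a custom Lucene Analyzer might look like, assuming the Lucene 3.x API that Mahout's 0.5-era vectorizer builds on; the filter chain, Version constant, and stop-word set are illustrative choices, not the original author's code.

    package unitNine;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    /**
     * Hypothetical stand-in for the MyAnalyzer used above:
     * tokenize with StandardTokenizer, lower-case, drop English stop words.
     */
    public class MyAnalyzer extends Analyzer {
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream stream = new StandardTokenizer(Version.LUCENE_31, reader);
            stream = new LowerCaseFilter(Version.LUCENE_31, stream);
            stream = new StopFilter(Version.LUCENE_31, stream,
                StandardAnalyzer.STOP_WORDS_SET);
            return stream;
        }
    }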

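The title promises Canopy generation followed by k-means, but the listing above stops after writing the TF-IDF vectors. A rough sketch of that follow-up step is shown below, assuming the Mahout 0.5-era CanopyDriver and KMeansDriver APIs (later Mahout releases changed these parameter lists); the class name, distance measures, T1/T2 thresholds, convergence delta, and iteration count are illustrative, not taken from the original article.

    package unitNine;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.clustering.canopy.CanopyDriver;
    import org.apache.mahout.clustering.kmeans.KMeansDriver;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.common.distance.TanimotoDistanceMeasure;

    /**
     * Hypothetical follow-up to ReutersToSparseVectors: seed k-means with
     * canopy centroids computed from the tfidf-vectors written above.
     */
    public class NewsCanopyKMeans {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String outputDir = "reuters";
            Path vectorsFolder = new Path(outputDir, "tfidf-vectors");

            // Canopy pass: a single cheap scan that produces initial centroids.
            // T1/T2 are the loose/tight distance thresholds (illustrative values).
            Path canopyCentroids = new Path(outputDir, "canopy-centroids");
            CanopyDriver.run(conf, vectorsFolder, canopyCentroids,
                new EuclideanDistanceMeasure(), 250, 120, false, false);

            // k-means refinement seeded with the canopies in "clusters-0":
            // convergence delta 0.01, at most 20 iterations, then classify the points.
            Path clusterOutput = new Path(outputDir, "clusters");
            KMeansDriver.run(conf, vectorsFolder,
                new Path(canopyCentroids, "clusters-0"), clusterOutput,
                new TanimotoDistanceMeasure(), 0.01, 20, true, false);
        }
    }

The point of the combination is that the cheap canopy pass supplies both the number of clusters and their starting centroids, so k-means does not need k chosen by hand.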