Clustering news with Canopy generation and k-means clustering


From: http://blog.csdn.net/u012965373/article/details/50754420


/**
 * @author YangXin
 * @info Cluster news with Canopy generation and k-means clustering
 */
package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class ReutersToSparseVectors {

    public static void main(String[] args) throws Exception {

         int minSupport = 5;                    // ignore terms occurring fewer than 5 times in the corpus
         int minDf = 5;                         // minimum document frequency for TF-IDF
         int maxDFPercent = 95;                 // ignore terms present in more than 95% of documents
         int maxNGramSize = 1;                  // unigrams only
         float minLLRValue = 50;                // log-likelihood ratio threshold (only used for n-grams > 1)
         int reduceTasks = 1;                   // number of reduce tasks
         int chunkSize = 200;                   // dictionary chunk size in MB
         int norm = 2;                          // L2 normalization
         boolean sequentialAccessOutput = true; // emit sequential-access sparse vectors
    
         String inputDir = "inputDir";
    
         Configuration conf = new Configuration();
         FileSystem fs = FileSystem.get(conf);
    
         String outputDir = "reuters";
         HadoopUtil.delete(conf, new Path(outputDir));
         Path tokenizedPath = new Path(outputDir,
             DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
         // MyAnalyzer is a custom Lucene Analyzer that is not part of this listing;
         // a sketch of one possible implementation follows the listing.
         MyAnalyzer analyzer = new MyAnalyzer();
         DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
             .asSubclass(Analyzer.class), tokenizedPath, conf);
    
         // Build term-frequency vectors from the tokenized documents...
         DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
           new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
           chunkSize, sequentialAccessOutput, false);
         // ...then weight them with TF-IDF.
         TFIDFConverter.processTfIdf(
           new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
           new Path(outputDir), conf, chunkSize, minDf,
           maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);
    
         String vectorsFolder = outputDir + "/tfidf-vectors";

         // Read back the generated TF-IDF vectors and print them.
         SequenceFile.Reader reader = new SequenceFile.Reader(fs,
             new Path(vectorsFolder, "part-r-00000"), conf);
    
         Text key = new Text();
         VectorWritable value = new VectorWritable();
         while (reader.next(key, value)) {
           System.out.println(key.toString() + " = > "
                              + value.get().asFormatString());
         }
         reader.close();
    }
}
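
The listing depends on a MyAnalyzer class that is not shown. DocumentProcessor.tokenizeDocuments only receives the class reference, so whatever analyzer is used needs a public no-argument constructor. Below is a minimal sketch of what such a custom Lucene Analyzer might look like, assuming the Lucene 3.x API that Mahout's 0.5-era vectorizer builds on; the filter chain, Version constant, and stop-word set are illustrative choices, not the original author's code.

    package unitNine;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    /**
     * Hypothetical stand-in for the MyAnalyzer used above:
     * tokenize with StandardTokenizer, lower-case, drop English stop words.
     */
    public class MyAnalyzer extends Analyzer {
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream stream = new StandardTokenizer(Version.LUCENE_31, reader);
            stream = new LowerCaseFilter(Version.LUCENE_31, stream);
            stream = new StopFilter(Version.LUCENE_31, stream,
                StandardAnalyzer.STOP_WORDS_SET);
            return stream;
        }
    }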

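The title promises Canopy generation followed by k-means, but the listing above stops after writing the TF-IDF vectors. A rough sketch of that follow-up step is shown below, assuming the Mahout 0.5-era CanopyDriver and KMeansDriver APIs (later Mahout releases changed these parameter lists); the class name, distance measures, T1/T2 thresholds, convergence delta, and iteration count are illustrative, not taken from the original article.

    package unitNine;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.mahout.clustering.canopy.CanopyDriver;
    import org.apache.mahout.clustering.kmeans.KMeansDriver;
    import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
    import org.apache.mahout.common.distance.TanimotoDistanceMeasure;

    /**
     * Hypothetical follow-up to ReutersToSparseVectors: seed k-means with
     * canopy centroids computed from the tfidf-vectors written above.
     */
    public class NewsCanopyKMeans {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            String outputDir = "reuters";
            Path vectorsFolder = new Path(outputDir, "tfidf-vectors");

            // Canopy pass: a single cheap scan that produces initial centroids.
            // T1/T2 are the loose/tight distance thresholds (illustrative values).
            Path canopyCentroids = new Path(outputDir, "canopy-centroids");
            CanopyDriver.run(conf, vectorsFolder, canopyCentroids,
                new EuclideanDistanceMeasure(), 250, 120, false, false);

            // k-means refinement seeded with the canopies in "clusters-0":
            // convergence delta 0.01, at most 20 iterations, then classify the points.
            Path clusterOutput = new Path(outputDir, "clusters");
            KMeansDriver.run(conf, vectorsFolder,
                new Path(canopyCentroids, "clusters-0"), clusterOutput,
                new TanimotoDistanceMeasure(), 0.01, 20, true, false);
        }
    }

The point of the combination is that the cheap canopy pass supplies both the number of clusters and their starting centroids, so k-means does not need k chosen by hand.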