基於模糊K-Means算法的新聞聚類

MamieH92 8年前發布 | 19K 次閱讀 算法

來自: http://blog.csdn.net/u012965373/article/details/50754449


<strong>/***

  • @author YangXin
  • @info 基于模糊K-Means算法的新聞聚類 */ package unitNine; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.lucene.analysis.Analyzer; import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.classify.WeightedVectorWritable; import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.distance.ManhattanDistanceMeasure; import org.apache.mahout.common.distance.TanimotoDistanceMeasure; import org.apache.mahout.vectorizer.DictionaryVectorizer; import org.apache.mahout.vectorizer.DocumentProcessor; import org.apache.mahout.vectorizer.tfidf.TFIDFConverter; public class NewsFuzzyKMeansClustering { public static void main(String args[]) throws Exception {

     int minSupport = 5;
     int minDf = 10;
     int maxDFPercent = 70;
     int maxNGramSize = 1;
     int minLLRValue = 200;
     int reduceTasks = 1;
     int chunkSize = 200;
     int norm = 2;
     boolean sequentialAccessOutput = true;
    
     String inputDir = "inputDir";
    
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     /*
     SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
         new Path(inputDir, "documents.seq"), Text.class, Text.class);
     for (Document d : Database) {
       writer.append(new Text(d.getID()), new Text(d.contents()));
     }
     writer.close();*/
    
     String outputDir = "newsClusters";
     HadoopUtil.delete(conf, new Path(outputDir));
    
     Path tokenizedPath = new Path(outputDir,
         DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
     MyAnalyzer analyzer = new MyAnalyzer();
     DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
         .asSubclass(Analyzer.class), tokenizedPath, conf);
    
     DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
       new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
       chunkSize, sequentialAccessOutput, false);
     TFIDFConverter.processTfIdf(
       new Path(outputDir , DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
       new Path(outputDir), conf, chunkSize, minDf,
       maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);
     String vectorsFolder = outputDir + "/tfidf-vectors";
     String canopyCentroids = outputDir + "/canopy-centroids";
     String clusterOutput = outputDir + "/clusters/";
    
     CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids),
       new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false);
    
     FuzzyKMeansDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids, "clusters-0"), new Path(clusterOutput),
       new TanimotoDistanceMeasure(), 0.01, 20, 2.0f, true, true, 0.0, false);
    
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
       clusterOutput + Cluster.CLUSTERED_POINTS_DIR +"/part-m-00000"), conf);
    
     IntWritable key = new IntWritable();
     WeightedVectorWritable value = new WeightedVectorWritable();
     while (reader.next(key, value)) {
       System.out.println("Cluster: " + key.toString() + " "
                          + value.getVector().asFormatString());
     }
     reader.close();
    

    } } </strong></pre>

 本文由用戶 MamieH92 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!