Hadoop 統計文件中某個單詞出現的次數

jopen 11年前發布 | 39K 次閱讀 Hadoop 分布式/云計算/大數據

如文件word.txt內容如下:

what is your name?

my name is zhang san。

要求統計word.txt中出現“is”的次數?

 

代碼如下:

PerWordMapper

import java.io.IOException;  
import java.util.StringTokenizer;  

import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Mapper;  

/**
 * Mapper that emits (word, 1) for every occurrence of the target word "is"
 * found in a line of input text.
 *
 * Input:  (Object byteOffset, Text line)
 * Output: (Text word, IntWritable one)
 */
public class PerWordMapper extends Mapper<Object, Text, Text, IntWritable> {

    public Text keyText = new Text();
    public IntWritable intValue = new IntWritable(1);

    @Override
    protected void map(Object key, Text value,
            Context context)
            throws IOException, InterruptedException {
        // Tokenize the line on whitespace and inspect each token.
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            // This is where the string to be counted is checked.
            if (token.equals("is")) {
                // Reuse the pre-allocated Text instead of constructing a new
                // one per match (the original defeated the purpose of the
                // field); Hadoop serializes the key at write() time, so
                // mutation afterwards is safe.
                keyText.set(token);
                context.write(keyText, intValue);
            }
        }
    }
}

PerWordReducer

import java.io.IOException;  

import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Reducer;  

/**
 * Reducer that sums the per-occurrence counts emitted by PerWordMapper,
 * producing the total number of times the target word appeared.
 *
 * Input:  (Text word, Iterable of IntWritable partial counts)
 * Output: (Text word, IntWritable total)
 */
public class PerWordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public IntWritable intValue = new IntWritable(0);

    @Override
    protected void reduce(Text key, Iterable<IntWritable> value,
            Context context)
            throws IOException, InterruptedException {
        // Bug fix: the original called value.iterator() on every loop pass
        // for both hasNext() and next(). That only terminated because Hadoop
        // happens to hand back the same iterator each call; with any
        // conforming Iterable it would spin forever on the first element.
        // Iterate exactly once with for-each instead.
        int sum = 0;
        for (IntWritable count : value) {
            sum += count.get();
        }
        intValue.set(sum);
        context.write(key, intValue);
    }
}
PerWordCount
import java.io.IOException;  

import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
import org.apache.hadoop.util.GenericOptionsParser;  

import com.hadoop.mapreducer.MapperClass;  
import com.hadoop.mapreducer.ReducerClass;  
import com.hadoop.mapreducer.WordCount;  

/**
 * Driver class: configures and submits the MapReduce job that counts how
 * many times the target word occurs in the input.
 *
 * Usage: wordcount &lt;in&gt; &lt;out&gt;
 * Exits 0 on job success, 1 on job failure, 2 on bad arguments.
 */
public class PerWordCount {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        // Strip generic Hadoop options (-D, -fs, ...) and keep the paths.
        String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
        System.out.println("otherArgs.length:" + remaining.length);
        if (remaining.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "word count");
        job.setJarByClass(PerWordCount.class);

        // The reducer doubles as the combiner: summing partial counts is
        // associative and commutative, so combining map-side is safe.
        job.setMapperClass(PerWordMapper.class);
        job.setCombinerClass(PerWordReducer.class);
        job.setReducerClass(PerWordReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(remaining[0]));
        FileOutputFormat.setOutputPath(job, new Path(remaining[1]));

        // Block until the job finishes and propagate its status as exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!