Counting the occurrences of English words in a file
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
// Mapper: emit (word, 1) for every whitespace-separated token in a line.
class WordMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Split on runs of whitespace; a bare " " would miss tabs and
        // produce empty tokens around consecutive spaces.
        String[] words = line.split("\\s+");
        for (String word : words) {
            if (!word.isEmpty()) {
                context.write(new Text(word), new LongWritable(1L));
            }
        }
    }
}
// Reducer: sum the 1s emitted for each word to get its total count.
class WordReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
public class WordCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Point the job at the HDFS NameNode.
        conf.set("fs.defaultFS", "hdfs://master:9000");

        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("word count example");
        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);
        // Declare the map output and final output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Input and output paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        if (success) {
            System.out.println("Word count MapReduce job succeeded!");
        } else {
            System.out.println("Word count MapReduce job failed.");
        }
    }
}
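Because the reduce step here is a plain sum, which is associative and commutative, the same WordReducer can also serve as a combiner, pre-aggregating (word, 1) pairs on the map side so less data crosses the shuffle. A minimal sketch of the one extra driver line (an optional optimization not in the original code, added to the WordCount driver above):

// Optional: pre-aggregate map output locally before the shuffle.
// Reusing the reducer is safe here because summation is associative and commutative.
job.setCombinerClass(WordReducer.class);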
Counting the occurrences of [玄德, 张飞, 张角] in Chapter 1 of Romance of the Three Kingdoms
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.List;
// Mapper: segment each line with HanLP and emit (name, 1) for the three target names.
class sgyyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private Segment segment;

    @Override
    protected void setup(Context context) {
        // Register the target names with the custom dictionary once per task,
        // and build one segmenter, instead of redoing both for every input line.
        CustomDictionary.add("玄德");
        CustomDictionary.add("张飞");
        CustomDictionary.add("张角");
        segment = HanLP.newSegment();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (Term term : segment.seg(line)) {
            String word = term.word;
            if ("张飞".equals(word) || "玄德".equals(word) || "张角".equals(word)) {
                context.write(new Text(word), new LongWritable(1L));
            }
        }
    }
}
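The segmentation logic can be sanity-checked outside the cluster as a plain Java program. A minimal standalone sketch (the class name and sample sentence are made up for illustration; it assumes the HanLP jar and its data files are on the local classpath):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.common.Term;

public class SegmentCheck {
    public static void main(String[] args) {
        // Register the three names once, as sgyyMapper.setup() does.
        CustomDictionary.add("玄德");
        CustomDictionary.add("张飞");
        CustomDictionary.add("张角");
        // Segment a sample sentence and print each token on its own line.
        for (Term term : HanLP.newSegment().seg("玄德与张飞引兵大破张角。")) {
            System.out.println(term.word);
        }
    }
}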
// Reducer: sum the 1s emitted for each name to get its total count.
class sgyyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
public class sgyyDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");

        Job job = Job.getInstance(conf);
        job.setJarByClass(sgyyDemo.class);
        job.setJobName("Romance of the Three Kingdoms character count");
        job.setMapperClass(sgyyMapper.class);
        job.setReducerClass(sgyyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        if (success) {
            System.out.println("Character count MapReduce job succeeded!");
        } else {
            System.out.println("Character count MapReduce job failed.");
        }
    }
}
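A practical note for both drivers: FileOutputFormat refuses to start a job whose output directory already exists, so a rerun fails until the old output is removed. A hedged sketch of a guard that could go in main() before waitForCompletion (it assumes the conf and args of the drivers above and additionally needs import org.apache.hadoop.fs.FileSystem):

// Delete a previous run's output directory so the job can be rerun.
FileSystem fs = FileSystem.get(conf);
Path outputPath = new Path(args[1]);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = delete recursively
}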