MapReduce Programming with HBase


A quick first try at saving MapReduce output into HBase, the large-scale distributed database. The example computes the page-view (PV) count for each URL. Because the input is in RCFile format, the hive-exec jar has to be shipped with the job, and the HBase jar has to be loaded as well; if the cluster administrators have already placed both jars under hadoop/lib on every node, this step can be skipped. Without further ado, here is the code:

package test.hbase;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.sohu.tv.dm.common.RCFileInputFormat;

public class URLCountHbase {
    // Mapper: reads RCFile rows and emits (url, 1) for each record.
    public static class HBaseMap extends
            Mapper<LongWritable, BytesRefArrayWritable, Text, IntWritable> {

        private IntWritable i = new IntWritable(1);

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable value,
                Context context) throws IOException, InterruptedException {
            // Column 4 of the RCFile row holds the URL.
            byte[] url = value.get(4).getBytesCopy();
            context.write(new Text(url), i);
        }

    }

    // Reducer: sums the counts for each URL and writes a Put to HBase
    // (row key = url, column family "type", qualifier "count").
    public static class HBaseReduce extends
            TableReducer<Text, IntWritable, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            Put put = new Put(Bytes.toBytes(key.toString()));
            // Store the count as a string in type:count.
            put.add(Bytes.toBytes("type"), Bytes.toBytes("count"),
                    Bytes.toBytes(String.valueOf(sum)));
            context.write(NullWritable.get(), put);
        }

    }

    // (Re)create the target table with a single column family "type".
    public static void createHbaseTable(String tablename) throws IOException {
        HTableDescriptor htd = new HTableDescriptor(tablename);
        HColumnDescriptor col = new HColumnDescriptor("type");
        htd.addFamily(col);
        Configuration config = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(config);
        if (admin.tableExists(tablename)) {
            System.out.println("table exists, recreating it");
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        System.out.println("create new table: " + tablename);
        admin.createTable(htd);

    }

    public static void main(String args[]) throws Exception {
        String tablename = "urlcount";
        Configuration conf = new Configuration();
        // Ship the hive-exec and hbase jars with the job via the distributed cache
        // (unnecessary if they already sit in hadoop/lib on every node).
        final FileSystem fs = FileSystem.getLocal(conf);
        final HashSet<String> localfiles = new HashSet<String>();
        localfiles.add("/opt/hadoop/hive-0.8.1/lib/hive-exec-0.8.1.jar");
        localfiles.add("/opt/hadoop/hbase/hbase-0.92.1.jar");
        final HashSet<String> files = new HashSet<String>();
        for (String s : localfiles) {
            files.add(URLCountHbase.convertPath(s, fs));
        }
        URLCountHbase.cacheJars(conf, files);
        // Tell TableOutputFormat which table to write to, then (re)create it.
        conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
        createHbaseTable(tablename);
        Job job = new Job(conf, "URLCount table with " + args[0]);
        job.setJarByClass(URLCountHbase.class);
        job.setNumReduceTasks(3);
        job.setReducerClass(HBaseReduce.class);
        job.setMapperClass(HBaseMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setInputFormatClass(RCFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

    // Qualify a local path against the local file system so it can be cached.
    private static String convertPath(String path, FileSystem fs) {
        final Path p = new Path(path);
        return p.makeQualified(fs).toString();
    }

    // Append the given jars to the job's "tmpjars" property so the MapReduce
    // framework distributes them to every task node.
    private static void cacheJars(Configuration job, Set<String> localUrls) throws IOException {
        if (localUrls.isEmpty()) {
            return;
        }
        final String tmpjars = job.get("tmpjars");
        final StringBuilder sb = new StringBuilder();
        if (null != tmpjars) {
            sb.append(tmpjars);
            sb.append(",");
        }
        sb.append(org.apache.hadoop.util.StringUtils.arrayToString(localUrls.toArray(new String[0])));
        job.set("tmpjars", sb.toString());
    }
}
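
To check the output, a small read-back sketch (not part of the original post) can query the urlcount table for a single URL and print its PV count. It assumes the same HBase 0.92-era client API used above; the class name URLCountLookup and the command-line URL argument are made up for illustration.

package test.hbase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class URLCountLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "urlcount");
        try {
            // The row key is the URL itself, as written by the reducer.
            Get get = new Get(Bytes.toBytes(args[0]));
            Result result = table.get(get);
            byte[] value = result.getValue(Bytes.toBytes("type"), Bytes.toBytes("count"));
            // The reducer stored the count as a string, so decode it the same way.
            System.out.println(args[0] + " -> "
                    + (value == null ? "not found" : Bytes.toString(value)));
        } finally {
            table.close();
        }
    }
}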
