/*
 * Decompiled with CFR 0.152.
 */
package eu.dicodeproject.analysis.hbase;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.vectorizer.DefaultAnalyzer;

public final class HBaseDocumentProcessor {
    public static final String ANALYZER_CLASS = "analyzer.class";

    private HBaseDocumentProcessor() {
    }

    public static void tokenizeDocuments(String table, String family, String column, Class<? extends Analyzer> analyzerClass, Path output) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = HBaseConfiguration.create();
        conf.set(ANALYZER_CLASS, analyzerClass.getName());
        Job job = new Job(conf);
        job.setJarByClass(HBaseDocumentProcessor.class);
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes((String)family), Bytes.toBytes((String)column));
        TableMapReduceUtil.initTableMapperJob((String)table, (Scan)scan, HBaseDocumentProcessorMapper.class, Text.class, StringTuple.class, (Job)job);
        job.setJobName("HBaseDocumentProcessor::DocumentTokenizer");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(StringTuple.class);
        FileOutputFormat.setOutputPath((Job)job, (Path)output);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(0);
        job.waitForCompletion(true);
    }

    private static class HBaseDocumentProcessorMapper
    extends TableMapper<Text, StringTuple> {
        private Analyzer analyzer;

        private HBaseDocumentProcessorMapper() {
        }

        public void setup(Mapper.Context context) throws IOException, InterruptedException {
            super.setup(context);
            try {
                ClassLoader ccl = Thread.currentThread().getContextClassLoader();
                Class<?> cl = ccl.loadClass(context.getConfiguration().get(HBaseDocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()));
                this.analyzer = (Analyzer)cl.newInstance();
            }
            catch (ClassNotFoundException e) {
                throw new IllegalStateException(e);
            }
            catch (InstantiationException e) {
                throw new IllegalStateException(e);
            }
            catch (IllegalAccessException e) {
                throw new IllegalStateException(e);
            }
        }

        public void map(ImmutableBytesWritable row, Result values, Mapper.Context context) throws IOException, InterruptedException {
            for (KeyValue keyValue : values.list()) {
                String key = new String(keyValue.getKey());
                String value = new String(keyValue.getValue());
                TokenStream stream = this.analyzer.tokenStream(key, (Reader)new StringReader(value));
                TermAttribute termAtt = (TermAttribute)stream.addAttribute(TermAttribute.class);
                StringTuple document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.termLength() <= 0) continue;
                    document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
                }
                context.write((Object)new Text(key), (Object)document);
            }
        }
    }
}

