/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class SparseVectorsFromSequenceFiles
extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);

    private SparseVectorsFromSequenceFiles() {
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run((Tool)new SparseVectorsFromSequenceFiles(), (String[])args);
    }

    /*
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    public int run(String[] args) throws Exception {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();
        DefaultOption inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription("input dir containing the documents in sequence file format").withShortName("i").create();
        DefaultOption outputDirOpt = obuilder.withLongName("output").withRequired(true).withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output directory").withShortName("o").create();
        DefaultOption minSupportOpt = obuilder.withLongName("minSupport").withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
        DefaultOption analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription("The class name of the analyzer").withShortName("a").create();
        DefaultOption chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();
        DefaultOption weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();
        DefaultOption minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();
        DefaultOption maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription("The max percentage of docs for the DF.  Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
        DefaultOption minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is 1.0").withShortName("ml").create();
        DefaultOption numReduceTasksOpt = obuilder.withLongName("numReducers").withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
        DefaultOption powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription("The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
        DefaultOption logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false).withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false").withShortName("lnorm").create();
        DefaultOption maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create()).withDescription("(Optional) The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
        DefaultOption sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false).withDescription("(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false").withShortName("seq").create();
        DefaultOption namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false).withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false").withShortName("nv").create();
        DefaultOption overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription("If set, overwrite the output directory").withShortName("ow").create();
        DefaultOption helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
        Group group = gbuilder.withName("Options").withOption((Option)minSupportOpt).withOption((Option)analyzerNameOpt).withOption((Option)chunkSizeOpt).withOption((Option)outputDirOpt).withOption((Option)inputDirOpt).withOption((Option)minDFOpt).withOption((Option)maxDFPercentOpt).withOption((Option)weightOpt).withOption((Option)powerOpt).withOption((Option)minLLROpt).withOption((Option)numReduceTasksOpt).withOption((Option)maxNGramSizeOpt).withOption((Option)overwriteOutput).withOption((Option)helpOpt).withOption((Option)sequentialAccessVectorOpt).withOption((Option)namedVectorOpt).withOption((Option)logNormalizeOpt).create();
        try {
            boolean processIdf;
            Parser parser = new Parser();
            parser.setGroup(group);
            parser.setHelpOption((Option)helpOpt);
            CommandLine cmdLine = parser.parse(args);
            if (cmdLine.hasOption((Option)helpOpt)) {
                CommandLineUtil.printHelp(group);
                return -1;
            }
            Path inputDir = new Path((String)cmdLine.getValue((Option)inputDirOpt));
            Path outputDir = new Path((String)cmdLine.getValue((Option)outputDirOpt));
            int chunkSize = 100;
            if (cmdLine.hasOption((Option)chunkSizeOpt)) {
                chunkSize = Integer.parseInt((String)cmdLine.getValue((Option)chunkSizeOpt));
            }
            int minSupport = 2;
            if (cmdLine.hasOption((Option)minSupportOpt)) {
                String minSupportString = (String)cmdLine.getValue((Option)minSupportOpt);
                minSupport = Integer.parseInt(minSupportString);
            }
            int maxNGramSize = 1;
            if (cmdLine.hasOption((Option)maxNGramSizeOpt)) {
                try {
                    maxNGramSize = Integer.parseInt(cmdLine.getValue((Option)maxNGramSizeOpt).toString());
                }
                catch (NumberFormatException ex) {
                    log.warn("Could not parse ngram size option");
                }
            }
            log.info("Maximum n-gram size is: {}", (Object)maxNGramSize);
            if (cmdLine.hasOption((Option)overwriteOutput)) {
                HadoopUtil.delete(this.getConf(), outputDir);
            }
            float minLLRValue = 1.0f;
            if (cmdLine.hasOption((Option)minLLROpt)) {
                minLLRValue = Float.parseFloat(cmdLine.getValue((Option)minLLROpt).toString());
            }
            log.info("Minimum LLR value: {}", (Object)Float.valueOf(minLLRValue));
            int reduceTasks = 1;
            if (cmdLine.hasOption((Option)numReduceTasksOpt)) {
                reduceTasks = Integer.parseInt(cmdLine.getValue((Option)numReduceTasksOpt).toString());
            }
            log.info("Number of reduce tasks: {}", (Object)reduceTasks);
            Class<DefaultAnalyzer> analyzerClass = DefaultAnalyzer.class;
            if (cmdLine.hasOption((Option)analyzerNameOpt)) {
                String className = cmdLine.getValue((Option)analyzerNameOpt).toString();
                analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
                analyzerClass.newInstance();
            }
            if (cmdLine.hasOption((Option)weightOpt)) {
                String wString = cmdLine.getValue((Option)weightOpt).toString();
                if ("tf".equalsIgnoreCase(wString)) {
                    processIdf = false;
                } else {
                    if (!"tfidf".equalsIgnoreCase(wString)) throw new OptionException((Option)weightOpt);
                    processIdf = true;
                }
            } else {
                processIdf = true;
            }
            int minDf = 1;
            if (cmdLine.hasOption((Option)minDFOpt)) {
                minDf = Integer.parseInt(cmdLine.getValue((Option)minDFOpt).toString());
            }
            int maxDFPercent = 99;
            if (cmdLine.hasOption((Option)maxDFPercentOpt)) {
                maxDFPercent = Integer.parseInt(cmdLine.getValue((Option)maxDFPercentOpt).toString());
            }
            float norm = -1.0f;
            if (cmdLine.hasOption((Option)powerOpt)) {
                String power = cmdLine.getValue((Option)powerOpt).toString();
                norm = "INF".equals(power) ? Float.POSITIVE_INFINITY : Float.parseFloat(power);
            }
            boolean logNormalize = false;
            if (cmdLine.hasOption((Option)logNormalizeOpt)) {
                logNormalize = true;
            }
            Configuration conf = this.getConf();
            HadoopUtil.delete(conf, outputDir);
            Path tokenizedPath = new Path(outputDir, "tokenized-documents");
            DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
            boolean sequentialAccessOutput = false;
            if (cmdLine.hasOption((Option)sequentialAccessVectorOpt)) {
                sequentialAccessOutput = true;
            }
            boolean namedVectors = false;
            if (cmdLine.hasOption((Option)namedVectorOpt)) {
                namedVectors = true;
            }
            if (!processIdf) {
                DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, conf, minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
                return 0;
            } else {
                if (!processIdf) return 0;
                DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, conf, minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize, sequentialAccessOutput, namedVectors);
                TFIDFConverter.processTfIdf(new Path(outputDir, "tf-vectors"), outputDir, conf, chunkSize, minDf, maxDFPercent, norm, logNormalize, sequentialAccessOutput, namedVectors, reduceTasks);
            }
            return 0;
        }
        catch (OptionException e) {
            log.error("Exception", (Throwable)e);
            CommandLineUtil.printHelp(group);
        }
        return 0;
    }
}

