/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer;

import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
import org.apache.mahout.vectorizer.term.TermCountMapper;
import org.apache.mahout.vectorizer.term.TermCountReducer;

/**
 * Static driver that turns a document collection stored in SequenceFiles into
 * term-frequency vectors.
 *
 * <p>The pipeline visible in this class is:
 * <ol>
 *   <li>count terms (or generate n-grams through {@link CollocDriver}) into the
 *       {@code wordcount} folder under the output path;</li>
 *   <li>assign every surviving term a consecutive integer id and write the
 *       term-to-id dictionary as one or more size-bounded chunk files;</li>
 *   <li>run one MapReduce job per dictionary chunk, producing a folder of partial vectors;</li>
 *   <li>merge all partial vectors into the {@code tf-vectors} folder and delete the partials.</li>
 * </ol>
 *
 * <p>This class is not instantiable.
 */
public final class DictionaryVectorizer {
    public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
    public static final String MIN_SUPPORT = "min.support";
    public static final String MAX_NGRAMS = "max.ngrams";
    public static final int DEFAULT_MIN_SUPPORT = 2;
    private static final String DICTIONARY_FILE = "dictionary.file-";
    /** Upper clamp, in megabytes, applied to the requested dictionary chunk size. */
    private static final int MAX_CHUNKSIZE = 10000;
    /** Lower clamp, in megabytes, applied to the requested dictionary chunk size. */
    private static final int MIN_CHUNKSIZE = 100;
    private static final String OUTPUT_FILES_PATTERN = "part-*";
    /** Fixed per-entry byte overhead used when estimating the size of a dictionary chunk. */
    private static final int DICTIONARY_BYTE_OVERHEAD = 4;
    private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
    private static final String DICTIONARY_JOB_FOLDER = "wordcount";
    /** Hadoop configuration key and value registering the serializers used by the jobs in this class. */
    private static final String SERIALIZATIONS_KEY = "io.serializations";
    private static final String SERIALIZATIONS_VALUE = "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization";

    private DictionaryVectorizer() {
    }

    /**
     * Builds term-frequency vectors for the documents under {@code input} and writes them to
     * {@code output/tf-vectors}.
     *
     * @param input                path of the SequenceFile input read by every job started here
     * @param output               base output path; intermediate folders are created beneath it
     * @param baseConf             configuration that is copied (never mutated) for each job
     * @param minSupport           minimum term support; negative values are replaced by
     *                             {@link #DEFAULT_MIN_SUPPORT}
     * @param maxNGramSize         {@code 1} runs the plain word-count job, any other value
     *                             delegates dictionary generation to {@link CollocDriver}
     * @param minLLRValue          forwarded to {@link CollocDriver}; unused when
     *                             {@code maxNGramSize == 1}
     * @param normPower            {@code -1} (presumably "no normalization" - confirm against
     *                             {@link PartialVectorMerger}) or a non-negative power; must be
     *                             finite and greater than 1 when {@code logNormalize} is set
     * @param logNormalize         forwarded to the partial-vector merge step
     * @param numReducers          number of reduce tasks for the n-gram, partial-vector and merge steps
     * @param chunkSizeInMegabytes requested dictionary chunk size, clamped to
     *                             [{@code MIN_CHUNKSIZE}, {@code MAX_CHUNKSIZE}]
     * @param sequentialAccess     forwarded to the vector-building jobs
     * @param namedVectors         forwarded to the vector-building jobs
     * @throws IllegalArgumentException if the {@code normPower} constraints above are violated
     * @throws IllegalStateException    if the word-count or a partial-vector job reports failure
     */
    public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf, int minSupport, int maxNGramSize, float minLLRValue, float normPower, boolean logNormalize, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess, boolean namedVectors) throws IOException, InterruptedException, ClassNotFoundException {
        Preconditions.checkArgument(normPower == -1.0f || normPower >= 0.0f,
            "If specified normPower must be nonnegative", normPower);
        Preconditions.checkArgument(
            !logNormalize || normPower == -1.0f || (normPower > 1.0f && !Float.isInfinite(normPower)),
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

        // Clamp the chunk size into the supported range.
        if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
            chunkSizeInMegabytes = MIN_CHUNKSIZE;
        } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) {
            chunkSizeInMegabytes = MAX_CHUNKSIZE;
        }
        if (minSupport < 0) {
            minSupport = DEFAULT_MIN_SUPPORT;
        }

        Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);
        // Single-element array used as an out-parameter: receives the number of dictionary terms.
        int[] maxTermDimension = new int[1];
        List<Path> dictionaryChunks;
        if (maxNGramSize == 1) {
            startWordCounting(input, dictionaryJobPath, baseConf, minSupport);
            dictionaryChunks = createDictionaryChunks(dictionaryJobPath, output, baseConf, chunkSizeInMegabytes, maxTermDimension);
        } else {
            CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue, numReducers);
            // The n-gram driver's results are read from the "ngrams" sub-folder of the job path.
            dictionaryChunks = createDictionaryChunks(new Path(dictionaryJobPath, "ngrams"), output, baseConf, chunkSizeInMegabytes, maxTermDimension);
        }

        // One partial-vector job per dictionary chunk.
        int partialVectorIndex = 0;
        List<Path> partialVectorPaths = new ArrayList<Path>();
        for (Path dictionaryChunk : dictionaryChunks) {
            Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
            partialVectorPaths.add(partialVectorOutputPath);
            makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath, maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        }

        Configuration conf = new Configuration(baseConf);
        Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, conf, normPower, logNormalize, maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        // The partial vectors are only intermediate data; remove them once merged.
        HadoopUtil.delete(conf, partialVectorPaths);
    }

    /**
     * Reads every record matching {@code part-*} under {@code wordCountPath}, assigns each key a
     * consecutive integer id starting at 0, and writes the (term, id) pairs into dictionary chunk
     * files named {@code dictionary.file-N} under {@code dictionaryPathBase}. A new chunk is
     * started once the estimated size of the current one exceeds the chunk size limit.
     *
     * <p>The writer is closed even when reading or writing fails part-way through.
     *
     * @param maxTermDimension single-element out-parameter; slot 0 receives the total number of
     *                         terms written across all chunks
     * @return the paths of all chunk files created, in creation order (always at least one)
     */
    private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
        List<Path> chunkPaths = new ArrayList<Path>();
        Configuration conf = new Configuration(baseConf);
        FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
        long chunkSizeLimit = (long) chunkSizeInMegabytes * 1024L * 1024L;

        int chunkIndex = 0;
        Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);
        SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
        try {
            long currentChunkSize = 0L;
            Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
            int termIndex = 0;
            for (Pair<Writable, Writable> record
                : new SequenceFileDirIterable<Writable, Writable>(filesPattern, PathType.GLOB, null, null, true, conf)) {
                if (currentChunkSize > chunkSizeLimit) {
                    // Roll over to a fresh chunk. Drop the reference before closing so the
                    // finally block never closes the same writer a second time.
                    SequenceFile.Writer fullChunk = dictWriter;
                    dictWriter = null;
                    fullChunk.close();
                    chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + ++chunkIndex);
                    chunkPaths.add(chunkPath);
                    dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                    currentChunkSize = 0L;
                }
                Writable key = record.getFirst();
                // Rough size estimate: fixed overhead + two bytes per character + one int id.
                int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
                currentChunkSize += fieldSize;
                dictWriter.append(key, new IntWritable(termIndex++));
            }
            maxTermDimension[0] = termIndex;

            // Normal completion: close loudly so that flush failures are reported to the caller.
            SequenceFile.Writer lastChunk = dictWriter;
            dictWriter = null;
            lastChunk.close();
        } finally {
            if (dictWriter != null) {
                // Only reached when an exception is already propagating; release the writer
                // without masking that original failure.
                try {
                    dictWriter.close();
                } catch (IOException ignored) {
                    // intentionally ignored - the primary exception is more informative
                }
            }
        }
        return chunkPaths;
    }

    /**
     * Runs the MapReduce job that builds partial vectors for a single dictionary chunk.
     *
     * <p>The job uses the identity {@link Mapper}, so the input SequenceFile must already hold
     * {@link Text} keys and {@link StringTuple} values; {@link TFPartialVectorReducer} produces the
     * {@link VectorWritable} output. The dictionary chunk is made available to tasks through the
     * {@link DistributedCache}. Any existing {@code output} folder is deleted before the job runs.
     *
     * @throws IllegalStateException if the job completes unsuccessfully
     */
    private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors, int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set(SERIALIZATIONS_KEY, SERIALIZATIONS_VALUE);
        conf.setInt("vector.dimension", dimension);
        conf.setBoolean("vector.sequentialAccess", sequentialAccess);
        conf.setBoolean("vector.named", namedVectors);
        conf.setInt(MAX_NGRAMS, maxNGramSize);
        DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);

        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath);
        job.setJarByClass(DictionaryVectorizer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(StringTuple.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);
        FileInputFormat.setInputPaths(job, new Path[] {input});
        FileOutputFormat.setOutputPath(job, output);

        job.setMapperClass(Mapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setReducerClass(TFPartialVectorReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(numReducers);

        HadoopUtil.delete(conf, output);
        runJob(job);
    }

    /**
     * Runs the term-counting MapReduce job ({@link TermCountMapper} with {@link TermCountReducer}
     * as both combiner and reducer), writing {@link Text}/{@link LongWritable} records to
     * {@code output}. The minimum support is passed to the tasks under the {@link #MIN_SUPPORT}
     * configuration key. Any existing {@code output} folder is deleted before the job runs.
     *
     * @throws IllegalStateException if the job completes unsuccessfully
     */
    private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set(SERIALIZATIONS_KEY, SERIALIZATIONS_VALUE);
        conf.setInt(MIN_SUPPORT, minSupport);

        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
        job.setJarByClass(DictionaryVectorizer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path[] {input});
        FileOutputFormat.setOutputPath(job, output);

        job.setMapperClass(TermCountMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setCombinerClass(TermCountReducer.class);
        job.setReducerClass(TermCountReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        HadoopUtil.delete(conf, output);
        runJob(job);
    }

    /**
     * Submits {@code job}, blocks until it finishes, and fails fast when it did not succeed so
     * that later pipeline stages never run against missing or partial output.
     */
    private static void runJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed: " + job.getJobName());
        }
    }
}

