/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer.tfidf;

import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TermDocumentCountMapper;
import org.apache.mahout.vectorizer.term.TermDocumentCountReducer;
import org.apache.mahout.vectorizer.tfidf.TFIDFPartialVectorReducer;

/**
 * Drives the conversion of term-frequency vectors into TF-IDF vectors.
 *
 * <p>The pipeline implemented by {@link #processTfIdf} is:
 * <ol>
 *   <li>run a document-frequency counting job over the input vectors,</li>
 *   <li>split the resulting frequency list into size-bounded dictionary chunks,</li>
 *   <li>run one partial-vector job per dictionary chunk,</li>
 *   <li>merge the partial vectors into the final output folder and delete the partials.</li>
 * </ol>
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public final class TFIDFConverter {
    /** Configuration key under which the number of vectors is passed to the partial-vector job. */
    public static final String VECTOR_COUNT = "vector.count";
    /** Configuration key under which the feature count (largest feature id + 1) is passed to the job. */
    public static final String FEATURE_COUNT = "feature.count";
    /** Configuration key under which the minimum document frequency is passed to the job. */
    public static final String MIN_DF = "min.df";
    /** Configuration key under which the maximum document-frequency percentage is passed to the job. */
    public static final String MAX_DF_PERCENTAGE = "max.df.percentage";

    /** Name of the sub-folder of the output path that receives the merged vectors. */
    private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
    /** File-name prefix of the dictionary (frequency) chunks; the chunk index is appended. */
    private static final String FREQUENCY_FILE = "frequency.file-";
    /** Upper bound applied to the requested chunk size, in megabytes. */
    private static final int MAX_CHUNKSIZE = 10000;
    /** Lower bound applied to the requested chunk size, in megabytes. */
    private static final int MIN_CHUNKSIZE = 100;
    /** Glob matching the part files written by the document-frequency job. */
    private static final String OUTPUT_FILES_PATTERN = "part-*";
    /** Per-record overhead, in bytes, assumed when estimating the size of a frequency chunk. */
    private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
    /** Estimated size of one (int key, long value) frequency record: 45 + 4 + 8 = 57 bytes. */
    private static final int FREQUENCY_RECORD_SIZE = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
    /** Folder-name prefix of the per-chunk partial vector outputs; the chunk index is appended. */
    private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
    /** Name of the sub-folder of the output path that receives the document-frequency counts. */
    private static final String WORDCOUNT_OUTPUT_FOLDER = "df-count";
    /** Value of {@code normPower} meaning "no normalization requested". */
    private static final float NO_NORMALIZING = -1.0f;
    /** Substitute used when the supplied maximum document-frequency percentage is outside 0..100. */
    private static final int DEFAULT_MAX_DF_PERCENT = 99;

    private TFIDFConverter() {
    }

    /**
     * Runs the complete TF-IDF pipeline: document-frequency counting, dictionary chunking,
     * per-chunk partial vector creation, and the final merge into
     * {@code <output>/tfidf-vectors}. The partial vector folders are deleted after the merge.
     *
     * @param input                  path of the input vectors (read as sequence files)
     * @param output                 base output path; all intermediate and final folders are created below it
     * @param baseConf               configuration that each job's configuration is copied from
     * @param chunkSizeInMegabytes   requested dictionary chunk size; clamped to [100, 10000]
     * @param minDf                  minimum document frequency; values below 1 are raised to 1
     * @param maxDFPercent           maximum document-frequency percentage; values outside 0..100 are replaced by 99
     * @param normPower              normalization power, or -1 for none; must otherwise be nonnegative,
     *                               and greater than 1 and finite when {@code logNormalize} is set
     * @param logNormalize           passed through to the partial vector merger
     * @param sequentialAccessOutput passed to the partial-vector jobs and to the merger
     * @param namedVector            passed to the partial-vector jobs and to the merger
     * @param numReducers            passed through to the partial vector merger
     * @throws IllegalArgumentException if {@code normPower} violates the constraints above
     * @throws IllegalStateException    if one of the MapReduce jobs started here does not complete successfully
     */
    public static void processTfIdf(Path input, Path output, Configuration baseConf, int chunkSizeInMegabytes, int minDf, int maxDFPercent, float normPower, boolean logNormalize, boolean sequentialAccessOutput, boolean namedVector, int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
        Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0.0f,
                "If specified normPower must be nonnegative", normPower);
        Preconditions.checkArgument(
                normPower == NO_NORMALIZING || (normPower > 1.0f && !Float.isInfinite(normPower)) || !logNormalize,
                "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

        // Clamp the tunables into their supported ranges instead of rejecting them.
        int chunkSize = Math.max(MIN_CHUNKSIZE, Math.min(MAX_CHUNKSIZE, chunkSizeInMegabytes));
        int effectiveMinDf = Math.max(1, minDf);
        int effectiveMaxDfPercent = (maxDFPercent < 0 || maxDFPercent > 100) ? DEFAULT_MAX_DF_PERCENT : maxDFPercent;

        Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);
        startDFCounting(input, wordCountPath, baseConf);

        Pair<Long[], List<Path>> datasetFeatures = createDictionaryChunks(wordCountPath, output, baseConf, chunkSize);
        // counts[0] = feature count, counts[1] = vector count (see createDictionaryChunks).
        Long[] counts = datasetFeatures.getFirst();

        int partialVectorIndex = 0;
        List<Path> partialVectorPaths = new ArrayList<Path>();
        for (Path dictionaryChunk : datasetFeatures.getSecond()) {
            Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
            partialVectorPaths.add(partialVectorOutputPath);
            makePartialVectors(input, baseConf, counts[0], counts[1], effectiveMinDf, effectiveMaxDfPercent,
                    dictionaryChunk, partialVectorOutputPath, sequentialAccessOutput, namedVector);
        }

        Configuration conf = new Configuration(baseConf);
        Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, baseConf, normPower, logNormalize,
                counts[0].intValue(), sequentialAccessOutput, namedVector, numReducers);
        HadoopUtil.delete(conf, partialVectorPaths);
    }

    /**
     * Reads the output of the document-frequency job and rewrites it into one or more
     * frequency chunk files named {@code frequency.file-<n>} below {@code dictionaryPathBase}.
     * A new chunk is started once the estimated size of the current one exceeds the limit.
     *
     * <p>Records with a nonnegative key are copied into the current chunk. The record with
     * key {@code -1} is not copied; its value is taken as the vector count.
     *
     * @param featureCountPath     folder holding the {@code part-*} files of the frequency job
     * @param dictionaryPathBase   folder in which the chunk files are created
     * @param baseConf             configuration to copy for file-system access
     * @param chunkSizeInMegabytes size limit of a single chunk
     * @return a pair of {@code {featureCount, vectorCount}} and the list of chunk paths, where
     *         {@code featureCount} is the largest key seen plus one and {@code vectorCount} is
     *         {@link Long#MAX_VALUE} if no record with key {@code -1} was found
     */
    private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
        List<Path> chunkPaths = new ArrayList<Path>();
        Configuration conf = new Configuration(baseConf);
        FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
        long chunkSizeLimit = (long) chunkSizeInMegabytes * 1024L * 1024L;

        int chunkIndex = 0;
        Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);
        SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

        long currentChunkSize = 0L;
        long featureCount = 0L;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        try {
            for (Pair<IntWritable, LongWritable> record
                    : new SequenceFileDirIterable<IntWritable, LongWritable>(filesPattern, PathType.GLOB, null, null, true, conf)) {
                if (currentChunkSize > chunkSizeLimit) {
                    // Current chunk is full: roll over to the next chunk file.
                    freqWriter.close();
                    chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + ++chunkIndex);
                    chunkPaths.add(chunkPath);
                    freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                    currentChunkSize = 0L;
                }
                // The size estimate is charged for every record read, including the key -1 record.
                currentChunkSize += FREQUENCY_RECORD_SIZE;

                IntWritable key = record.getFirst();
                LongWritable value = record.getSecond();
                if (key.get() >= 0) {
                    freqWriter.append(key, value);
                } else if (key.get() == -1) {
                    vectorCount = value.get();
                }
                featureCount = Math.max(key.get(), featureCount);
            }
        } finally {
            // Release the open chunk writer even when reading or appending fails.
            freqWriter.close();
        }

        // Keys are zero-based ids, so the count is the largest id plus one.
        featureCount++;
        Long[] counts = {featureCount, vectorCount};
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    }

    /**
     * Configures and runs the job that builds partial vectors for a single dictionary chunk.
     * The chunk file is registered in the distributed cache, the counts and thresholds are
     * published through the job configuration, and any existing {@code output} is deleted
     * before the job starts.
     *
     * @param input              path of the input vectors
     * @param baseConf           configuration to copy for the job
     * @param featureCount       value stored under {@link #FEATURE_COUNT}
     * @param vectorCount        value stored under {@link #VECTOR_COUNT}
     * @param minDf              value stored under {@link #MIN_DF}
     * @param maxDFPercent       value stored under {@link #MAX_DF_PERCENTAGE}
     * @param dictionaryFilePath frequency chunk made available through the distributed cache
     * @param output             output folder of the job
     * @param sequentialAccess   value stored under {@code vector.sequentialAccess}
     * @param namedVector        value stored under {@code vector.named}
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void makePartialVectors(Path input, Configuration baseConf, Long featureCount, Long vectorCount, int minDf, int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess, boolean namedVector) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        conf.setLong(FEATURE_COUNT, featureCount.longValue());
        conf.setLong(VECTOR_COUNT, vectorCount.longValue());
        conf.setInt(MIN_DF, minDf);
        conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
        conf.setBoolean("vector.sequentialAccess", sequentialAccess);
        conf.setBoolean("vector.named", namedVector);
        DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);

        Job job = new Job(conf);
        job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
        job.setJarByClass(TFIDFConverter.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        // The base Mapper class is used as the mapper; the reducer does the TF-IDF work.
        job.setMapperClass(Mapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setReducerClass(TFIDFPartialVectorReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        HadoopUtil.delete(conf, output);
        runJob(job);
    }

    /**
     * Configures and runs the document-frequency counting job over {@code input}, writing
     * (IntWritable, LongWritable) records to {@code output}. Any existing {@code output} is
     * deleted before the job starts.
     *
     * @param input    path of the input vectors
     * @param output   output folder of the job
     * @param baseConf configuration to copy for the job
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void startDFCounting(Path input, Path output, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");

        Job job = new Job(conf);
        job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input);
        job.setJarByClass(TFIDFConverter.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setMapperClass(TermDocumentCountMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setCombinerClass(TermDocumentCountReducer.class);
        job.setReducerClass(TermDocumentCountReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        HadoopUtil.delete(conf, output);
        runJob(job);
    }

    /**
     * Submits the job, waits for it to finish, and fails fast when it reports failure so
     * that later pipeline stages never run on missing or partial output.
     *
     * @param job fully configured job to run
     * @throws IllegalStateException if {@code waitForCompletion} returns {@code false}
     */
    private static void runJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed: " + job.getJobName());
        }
    }
}

