package org.wikibrain.sr.word2vec;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.nlp.Dictionary;
import org.wikibrain.core.nlp.StringTokenizer;
import org.wikibrain.phrases.AnchorTextPhraseAnalyzer;
import org.wikibrain.phrases.PhraseAnalyzer;
import org.wikibrain.phrases.PhraseAnalyzerDao;
import org.wikibrain.utils.Scoreboard;
import org.wikibrain.utils.WpIOUtils;

/* loaded from: input_file:org/wikibrain/sr/word2vec/Word2Phrase.class */
/**
 * Rewrites a tokenized corpus so that adjacent tokens which co-occur
 * unusually often are joined into a single token with {@code '_'}.
 *
 * <p>The corpus is processed in several passes. Pass {@code p} (0-based)
 * tries to extend phrases of {@code p + 1} words by one more word, so after
 * {@code maxNgram - 1} passes phrases of up to {@code maxNgram} words may
 * have been formed. The score cutoff used for joining is learned on the first
 * pass from the phrases supplied by the {@link PhraseAnalyzerDao} and is
 * relaxed on every later pass.
 *
 * <p>Instances keep per-run state (the current dictionary and threshold) in
 * fields and perform no synchronization.
 */
public class Word2Phrase {
    private static final Logger LOG = LoggerFactory.getLogger(Word2Phrase.class);

    /** Exclusive upper bound of the integer cutoffs tried by {@link #learnThreshold(int)}. */
    private static final int MAX_CANDIDATE_THRESHOLD = 1000;

    /** Weight applied to precision in the denominator of the F-measure (beta squared, beta = 0.5). */
    private static final double BETA_SQUARED = 0.25d;

    /** Capacity of each scoreboard used when sampling negative examples. */
    private static final int SAMPLE_POOL_SIZE = 1000;

    /** Lowest cutoff a relaxed (non-learned) threshold may reach. */
    private static final int MIN_THRESHOLD = 5;

    /** Divisor applied to the previous cutoff on every pass after the one that learned it. */
    private static final int THRESHOLD_DECAY = 3;

    private final PhraseAnalyzerDao phraseDao;
    private final Language language;

    // Not read anywhere in this class; kept so the field layout stays unchanged.
    private int minReduce = 2;

    /** Minimum unigram count for a token to take part in a phrase; also the bigram count discount. */
    private int minCount = 5;

    /** Current score cutoff; a negative value means "not learned yet". */
    private int threshold = -1;

    private StringTokenizer tokenizer = new StringTokenizer();

    /** Counts for the input of the pass currently being processed. */
    private Dictionary dictionary;

    /**
     * @param language          language of the corpus and of the known phrases
     * @param phraseAnalyzerDao source of known phrases used to learn the score cutoff
     */
    public Word2Phrase(Language language, PhraseAnalyzerDao phraseAnalyzerDao) {
        this.language = language;
        this.phraseDao = phraseAnalyzerDao;
    }

    /**
     * Runs {@code maxNgram - 1} joining passes over {@code inputDir/corpus.txt}.
     *
     * <p>The first pass reads {@code corpus.txt} from {@code inputDir}; every
     * pass except the last writes {@code phrases.txt.N} (N = phrase length of
     * that pass) into {@code outputDir}, which the next pass reads; the last
     * pass writes {@code corpus.txt} into {@code outputDir}. Finally the
     * dictionary built for the last pass (i.e. counted over that pass's
     * input) is written to {@code outputDir/dictionary.txt}.
     *
     * @param inputDir  directory containing the input {@code corpus.txt}
     * @param outputDir directory receiving intermediate and final files
     * @param maxNgram  maximum number of words per phrase; must be at least 2
     * @throws IllegalArgumentException if {@code maxNgram < 2}
     * @throws IllegalStateException    if no known phrase of length 2 is usable for learning
     * @throws IOException              if reading or writing any file fails
     */
    public void concatenateBigrams(File inputDir, File outputDir, int maxNgram) throws IOException {
        if (maxNgram < 2) {
            throw new IllegalArgumentException(
                    "maximum phrase length must be at least 2, but was " + maxNgram);
        }
        int passes = maxNgram - 1;
        for (int pass = 0; pass < passes; pass++) {
            int phraseLength = pass + 2;
            LOG.info("pass " + pass + ": joining phrases of length " + (pass + 1) + " to " + phraseLength);

            File input = (pass == 0)
                    ? new File(inputDir, "corpus.txt")
                    : new File(outputDir, "phrases.txt." + (pass + 1));
            File output = (pass == passes - 1)
                    ? new File(outputDir, "corpus.txt")
                    : new File(outputDir, "phrases.txt." + phraseLength);

            // Counts must be rebuilt each pass because the tokens themselves change.
            this.dictionary = new Dictionary(this.language, Dictionary.WordStorage.ON_DISK);
            this.dictionary.setCountBigrams(true);
            this.dictionary.countNormalizedFile(input);

            if (pass == 0 && this.threshold < 0) {
                this.threshold = learnThreshold(phraseLength);
            } else {
                // Longer phrases are rarer, so relax the cutoff (integer division), but never below the floor.
                this.threshold = Math.max(MIN_THRESHOLD, this.threshold / THRESHOLD_DECAY);
            }
            processFile(input, output, phraseLength);
        }
        this.dictionary.write(new File(outputDir, "dictionary.txt"));
    }

    /**
     * Copies {@code input} to {@code output} line by line, joining adjacent
     * tokens with {@code '_'} where {@link #shouldConcatenate} says so.
     * Both streams are closed even if processing fails.
     */
    private void processFile(File input, File output, int phraseLength) throws IOException {
        BufferedReader reader = WpIOUtils.openBufferedReader(input);
        try {
            BufferedWriter writer = WpIOUtils.openWriter(output);
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    writer.write(joinLine(line, phraseLength));
                }
            } finally {
                // Closing also flushes; without it buffered output could be lost.
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /** Rewrites one corpus line, returning it terminated by {@code '\n'}. */
    private String joinLine(String line, int phraseLength) {
        String[] tokens = line.trim().split(" +");
        StringBuilder joined = new StringBuilder();
        for (int k = 0; k < tokens.length; k++) {
            if (k > 0) {
                joined.append(shouldConcatenate(tokens[k - 1], tokens[k], phraseLength) ? '_' : ' ');
            }
            joined.append(tokens[k]);
        }
        return joined.append('\n').toString();
    }

    /**
     * Decides whether two adjacent tokens form a phrase of exactly
     * {@code phraseLength} words: the left token must already consist of
     * {@code phraseLength - 1} words, the right token of a single word, and
     * their score must reach the current threshold.
     */
    private boolean shouldConcatenate(String left, String right, int phraseLength) {
        int leftWords = StringUtils.countMatches(left, "_") + 1;
        int rightWords = StringUtils.countMatches(right, "_") + 1;
        return leftWords == phraseLength - 1
                && rightWords == 1
                && scoreBigram(left, right) >= (double) this.threshold;
    }

    /**
     * Scores a token pair as
     * {@code (bigramCount - minCount) * totalCount / (count(first) * count(second))}.
     *
     * @return the score, or {@code 0.0} if {@code first} matches
     *         {@code Word2VecUtils.PATTERN_ID} or either token is rarer than {@code minCount}
     */
    private double scoreBigram(String first, String second) {
        if (Word2VecUtils.PATTERN_ID.matcher(first).matches()) {
            return 0.0d;
        }
        int firstCount = this.dictionary.getUnigramCount(first);
        int secondCount = this.dictionary.getUnigramCount(second);
        if (firstCount < this.minCount || secondCount < this.minCount) {
            return 0.0d;
        }
        if (firstCount <= 0 || secondCount <= 0) {
            // Only reachable when minCount <= 0; avoids dividing by zero.
            return 0.0d;
        }
        double discounted = this.dictionary.getBigramCount(first, second) - this.minCount;
        // Multiply as doubles: the product of two int counts can overflow int.
        double denominator = (double) firstCount * secondCount;
        return (discounted * this.dictionary.getTotalCount()) / denominator;
    }

    /**
     * Picks the integer cutoff in {@code [0, 1000)} that best separates known
     * phrases from randomly paired tokens, using the current dictionary.
     *
     * <p>For each candidate cutoff, precision and recall of "score &gt;= cutoff"
     * are computed over the known phrases (positives) and an equally sized
     * random sample (negatives). The cutoff maximizing
     * {@code precision * recall / (0.25 * precision + recall)} wins, which
     * ranks cutoffs like an F-measure with beta = 0.5; ties keep the lowest cutoff.
     *
     * @param phraseLength number of words of the known phrases to learn from
     * @return the best cutoff, or 0 if no cutoff achieved a positive measure
     * @throws IllegalStateException if no usable known phrase of that length exists
     */
    public int learnThreshold(int phraseLength) {
        List<String[]> knownBigrams = getKnownBigrams(phraseLength);
        if (knownBigrams.isEmpty()) {
            throw new IllegalStateException("Found no anchor texts of length " + phraseLength);
        }
        List<String[]> negatives = getNonBigramSample(knownBigrams, knownBigrams.size());

        List<Double> positiveScores = sortedScores(knownBigrams);
        List<Double> negativeScores = sortedScores(negatives);

        double bestMeasure = 0.0d;
        int bestCutoff = 0;
        for (int cutoff = 0; cutoff < MAX_CANDIDATE_THRESHOLD; cutoff++) {
            int truePositives = countAtLeast(positiveScores, cutoff);
            int falsePositives = countAtLeast(negativeScores, cutoff);
            int accepted = truePositives + falsePositives;
            if (accepted <= 0) {
                continue;
            }
            double precision = (1.0d * truePositives) / accepted;
            double recall = (1.0d * truePositives) / positiveScores.size();
            double measure = (precision * recall) / ((BETA_SQUARED * precision) + recall);
            // NaN (precision == recall == 0) compares false and is skipped.
            if (measure > bestMeasure) {
                bestCutoff = cutoff;
                bestMeasure = measure;
            }
        }
        LOG.info("learned threshold " + bestCutoff + " for words of length " + phraseLength
                + " with " + knownBigrams.size() + " known bigrams");
        return bestCutoff;
    }

    /** Scores every pair with {@link #scoreBigram} and returns the scores in ascending order. */
    private List<Double> sortedScores(List<String[]> pairs) {
        List<Double> scores = new ArrayList<Double>(pairs.size());
        for (String[] pair : pairs) {
            scores.add(Double.valueOf(scoreBigram(pair[0], pair[1])));
        }
        Collections.sort(scores);
        return scores;
    }

    /**
     * Counts the entries of an ascending list that are {@code >= cutoff},
     * using a lower-bound binary search so duplicates are handled exactly.
     */
    private static int countAtLeast(List<Double> ascendingScores, double cutoff) {
        int lo = 0;
        int hi = ascendingScores.size();
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (ascendingScores.get(mid).doubleValue() < cutoff) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        return ascendingScores.size() - lo;
    }

    /**
     * Collects known phrases of exactly {@code phraseLength} words as pairs
     * {@code {first phraseLength-1 words joined by '_', last word}}, keeping
     * only pairs whose two parts each occur at least {@code minCount} times
     * in the current dictionary.
     */
    private List<String[]> getKnownBigrams(int phraseLength) {
        List<String[]> known = new ArrayList<String[]>();
        Iterator<String> phrases = this.phraseDao.getAllPhrases(this.language);
        while (phrases.hasNext()) {
            List<String> words = this.tokenizer.getWords(this.language, phrases.next());
            if (words.size() != phraseLength) {
                continue;
            }
            int lastIndex = words.size() - 1;
            String prefix = StringUtils.join(words.subList(0, lastIndex), '_');
            String lastWord = words.get(lastIndex);
            if (this.dictionary.getUnigramCount(prefix) >= this.minCount
                    && this.dictionary.getUnigramCount(lastWord) >= this.minCount) {
                known.add(new String[]{prefix, lastWord});
            }
        }
        return known;
    }

    /**
     * Draws up to {@code sampleSize} random pairs that are not known phrases.
     *
     * <p>Left and right parts of the known pairs are offered to two
     * scoreboards of capacity 1000, scored by unigram count; pairs are then
     * formed by picking one entry from each scoreboard uniformly at random
     * and rejecting combinations that are themselves known phrases. The
     * number of draws is bounded so the method terminates even when (almost)
     * every combination is a known phrase; in that case fewer pairs are returned.
     */
    private List<String[]> getNonBigramSample(List<String[]> knownBigrams, int sampleSize) {
        List<String[]> sample = new ArrayList<String[]>();
        if (knownBigrams.isEmpty()) {
            // Nothing to draw from; Random.nextInt(0) would throw.
            return sample;
        }
        Scoreboard<String> leftPool = new Scoreboard<String>(SAMPLE_POOL_SIZE);
        Scoreboard<String> rightPool = new Scoreboard<String>(SAMPLE_POOL_SIZE);
        HashSet<String> knownKeys = new HashSet<String>();
        for (String[] pair : knownBigrams) {
            leftPool.add(pair[0], this.dictionary.getUnigramCount(pair[0]));
            rightPool.add(pair[1], this.dictionary.getUnigramCount(pair[1]));
            knownKeys.add(pair[0] + "_" + pair[1]);
        }

        Random random = new Random();
        long maxAttempts = Math.max(10000L, 100L * sampleSize);
        long attempts = 0;
        while (sample.size() < sampleSize && attempts < maxAttempts) {
            attempts++;
            String left = leftPool.getElement(random.nextInt(leftPool.size()));
            String right = rightPool.getElement(random.nextInt(rightPool.size()));
            if (!knownKeys.contains(left + "_" + right)) {
                sample.add(new String[]{left, right});
            }
        }
        if (sample.size() < sampleSize) {
            LOG.warn("only found " + sample.size() + " of " + sampleSize
                    + " requested non-phrase pairs after " + attempts + " draws");
        }
        return sample;
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -i/--input} (required) input directory,
     * {@code -o/--output} (required) output directory, {@code -m/--minCount}
     * minimum unigram frequency (default 5), {@code -g/--maxngram} maximum
     * phrase length (default 4), plus the standard options added by
     * {@link EnvBuilder#addStandardOptions}. On a command-line parse error
     * the usage text is printed and the method returns normally.
     *
     * @throws NumberFormatException if {@code -m} or {@code -g} is not an integer
     */
    public static void main(String[] strArr) throws IOException, ConfigurationException {
        Options options = new Options();
        options.addOption(new DefaultOptionBuilder().hasArg().isRequired().withLongOpt("input").withDescription("corpus input directory").create("i"));
        options.addOption(new DefaultOptionBuilder().hasArg().isRequired().withLongOpt("output").withDescription("corpus output directory (existing data will be lost)").create("o"));
        options.addOption(new DefaultOptionBuilder().hasArg().withLongOpt("minCount").withDescription("minimum frequency for unigrams that should be collapsed").create("m"));
        options.addOption(new DefaultOptionBuilder().hasArg().withLongOpt("maxngram").withDescription("maximum number of words that should be concatenated together").create("g"));
        EnvBuilder.addStandardOptions(options);
        try {
            CommandLine cmd = new PosixParser().parse(options, strArr);
            Env env = new EnvBuilder(cmd).build();
            Language defaultLanguage = env.getLanguages().getDefaultLanguage();
            AnchorTextPhraseAnalyzer analyzer =
                    (AnchorTextPhraseAnalyzer) env.getConfigurator().get(PhraseAnalyzer.class, "anchortext");
            Word2Phrase word2Phrase = new Word2Phrase(defaultLanguage, analyzer.getDao());
            if (cmd.hasOption("m")) {
                // The option was previously accepted but silently ignored.
                word2Phrase.minCount = Integer.parseInt(cmd.getOptionValue("m"));
            }
            word2Phrase.concatenateBigrams(
                    new File(cmd.getOptionValue("i")),
                    new File(cmd.getOptionValue("o")),
                    Integer.parseInt(cmd.getOptionValue("g", "4")));
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("Word2Phrase", options);
        }
    }
}
