package org.wikibrain.sr.dataset;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.nlp.Dictionary;
import org.wikibrain.sr.utils.KnownSim;
import org.wikibrain.sr.wikify.Corpus;
import org.wikibrain.sr.wikify.WbCorpusLineReader;
import org.wikibrain.utils.Scoreboard;

/* loaded from: input_file:org/wikibrain/sr/dataset/FakeDatasetCreator.class */
/**
 * Builds a synthetic ("fake") dataset of word pairs with assigned similarity
 * values, derived from word co-occurrence counts over the lines of a corpus file.
 *
 * <p>Words are taken from the dictionary's frequent unigrams, skipping words that
 * contain a digit or start with an upper-case character. The first
 * {@code stopWordRank} of the remaining words are discarded; the rest are the
 * <em>candidates</em>. <em>Targets</em> are a random sample of the first
 * {@code maxTargetRank} candidates.
 */
public class FakeDatasetCreator {
    /** Matches any word containing a digit; such words are never used. */
    private static final Pattern HAS_DIGIT = Pattern.compile(".*[0-9].*");

    /** Constant added to each unigram count when normalizing a co-occurrence count. */
    private static final double COUNT_SMOOTHING = 5.0d;

    private int stopWordRank = 1000;
    private int maxTargetRank = 1000;
    private int maxCandidateRank = 30000;
    private final Dictionary dictionary;
    private final File path;
    private final Language lang;

    /**
     * Creates a generator whose dictionary is built by counting the given file
     * (via {@code Dictionary.countNormalizedFile}).
     *
     * @param language language of the corpus
     * @param file corpus file; it is counted here and read again by {@link #generate(int)}
     * @throws IOException if counting the file fails
     */
    public FakeDatasetCreator(Language language, File file) throws IOException {
        this.lang = language;
        this.path = file;
        this.dictionary = new Dictionary(language, Dictionary.WordStorage.IN_MEMORY);
        this.dictionary.countNormalizedFile(file);
    }

    /**
     * Creates a generator from a corpus, reading the dictionary from the corpus'
     * dictionary file instead of recounting the corpus file.
     *
     * @param corpus source of the language, dictionary file and corpus file
     * @throws IOException if the dictionary file cannot be read
     */
    public FakeDatasetCreator(Corpus corpus) throws IOException {
        this.dictionary = new Dictionary(corpus.getLanguage(), Dictionary.WordStorage.IN_MEMORY);
        this.dictionary.read(corpus.getDictionaryFile());
        this.path = corpus.getCorpusFile();
        this.lang = corpus.getLanguage();
    }

    /**
     * Sets how many of the most frequent usable words are dropped from the front
     * of the word list before candidates are chosen.
     */
    public void setStopWordRank(int i) {
        this.stopWordRank = i;
    }

    /** Sets how many of the leading candidates are eligible to become targets. */
    public void setMaxTargetRank(int i) {
        this.maxTargetRank = i;
    }

    /** Sets the maximum number of usable frequent words collected (before stop words are dropped). */
    public void setMaxCandidateRank(int i) {
        this.maxCandidateRank = i;
    }

    /**
     * Generates the fake dataset.
     *
     * <p>Each target is paired with one candidate. Candidates are ranked per target
     * by {@code log(cooccurrences / ((count(target) + 5) * (count(candidate) + 5)))},
     * best first. The k-th target (in shuffled order) is paired with the candidate
     * at a rank that grows geometrically with k and is given the similarity
     * {@code 1 - k / numberOfTargets}.
     *
     * @param i requested number of pairs; fewer are produced when fewer targets are
     *          available (the target pool is capped by {@code maxTargetRank})
     * @return a dataset named {@code "fake"} containing the generated pairs
     * @throws IOException declared for failures reading the corpus file
     * @throws IllegalArgumentException if fewer usable frequent words exist than
     *         {@code stopWordRank}
     */
    public Dataset generate(int i) throws IOException {
        List<String> frequent = selectFrequentWords();
        if (frequent.size() < this.stopWordRank) {
            throw new IllegalArgumentException(
                    "only " + frequent.size() + " usable frequent words found, fewer than stopWordRank="
                            + this.stopWordRank);
        }

        // Candidates: everything after the stop words. Map each to its position.
        List<String> candidates = frequent.subList(this.stopWordRank, frequent.size());
        Map<String, Integer> candidateIndexes = new HashMap<String, Integer>();
        for (String word : candidates) {
            candidateIndexes.put(word, Integer.valueOf(candidateIndexes.size()));
        }

        // Targets: a random sample of at most i words from the leading candidates.
        List<String> targetList = new ArrayList<String>(
                candidates.size() > this.maxTargetRank ? candidates.subList(0, this.maxTargetRank) : candidates);
        Collections.shuffle(targetList);
        // Bound by the target list itself: it may be shorter than i even when
        // there are more than i candidates (maxTargetRank < i).
        Set<String> targets = new HashSet<String>(targetList.subList(0, Math.min(i, targetList.size())));

        Map<String, int[]> cocounts = countCooccurrences(targets, candidateIndexes);

        targetList.clear();
        targetList.addAll(cocounts.keySet());
        Collections.shuffle(targetList);

        // Unigram counts of the candidates do not depend on the target; look them up once.
        int[] candidateCounts = new int[candidates.size()];
        for (int c = 0; c < candidateCounts.length; c++) {
            candidateCounts[c] = this.dictionary.getUnigramCount(candidates.get(c));
        }

        List<KnownSim> sims = new ArrayList<KnownSim>();
        // Base of the geometric rank progression: rankBase^i == candidates.size() / 3.
        // NOTE(review): size() / 3 is an integer (floor) division; kept as in the original.
        double rankBase = Math.pow(candidates.size() / 3, 1.0d / i);
        for (int k = 0; k < targetList.size(); k++) {
            String target = targetList.get(k);
            int[] counts = cocounts.get(target);
            double[] scores = new double[counts.length];
            int targetCount = this.dictionary.getUnigramCount(target);
            for (int c = 0; c < candidates.size(); c++) {
                scores[c] = Math.log(counts[c]
                        / ((targetCount + COUNT_SMOOTHING) * (candidateCounts[c] + COUNT_SMOOTHING)));
            }
            Integer[] order = indexesByDescendingScore(scores);

            // Clamp so that tiny candidate lists cannot index past the end.
            int rank = (int) Math.min(Math.round(Math.pow(rankBase, k)), order.length - 1);
            String partner = candidates.get(order[rank].intValue());
            double similarity = 1.0d - ((1.0d * k) / targetList.size());
            sims.add(new KnownSim(target.replace('_', ' '), partner.replace('_', ' '), similarity, this.lang));
        }
        return new Dataset("fake", this.lang, sims);
    }

    /**
     * Collects up to {@code maxCandidateRank} frequent unigrams, in dictionary
     * order, skipping empty words, words containing a digit and words starting
     * with an upper-case character.
     */
    private List<String> selectFrequentWords() {
        List<String> frequent = new ArrayList<String>();
        // Over-fetch by 3x since some of the unigrams are filtered out.
        for (String word : this.dictionary.getFrequentUnigrams(this.maxCandidateRank * 3)) {
            if (word.isEmpty() || HAS_DIGIT.matcher(word).find() || Character.isUpperCase(word.charAt(0))) {
                continue;
            }
            frequent.add(word);
            if (frequent.size() >= this.maxCandidateRank) {
                break;
            }
        }
        return frequent;
    }

    /**
     * Scans the corpus file and, for every target, counts how often each candidate
     * appears on the same line as the target. A candidate token is counted once per
     * occurrence on the line; a target is counted once per line.
     *
     * @return map from target to an array of counts indexed by candidate index
     */
    private Map<String, int[]> countCooccurrences(Set<String> targets, Map<String, Integer> candidateIndexes)
            throws IOException {
        Map<String, int[]> cocounts = new HashMap<String, int[]>();
        for (String target : targets) {
            cocounts.put(target, new int[candidateIndexes.size()]);
        }

        Set<String> lineTargets = new HashSet<String>();
        // NOTE(review): the reader is never closed here; confirm whether
        // WbCorpusLineReader releases its file handle when iteration finishes.
        Iterator<WbCorpusLineReader.Line> lines = new WbCorpusLineReader(this.path).iterator();
        while (lines.hasNext()) {
            String[] tokens = lines.next().getLine().split("\\s+");
            lineTargets.clear();
            for (String token : tokens) {
                if (targets.contains(token)) {
                    lineTargets.add(token);
                }
            }
            for (String target : lineTargets) {
                int[] counts = cocounts.get(target);
                for (String token : tokens) {
                    Integer index = candidateIndexes.get(token);
                    if (index != null) {
                        counts[index.intValue()]++;
                    }
                }
            }
        }
        return cocounts;
    }

    /** Returns the indexes of {@code scores} ordered from highest to lowest score (stable for ties). */
    private static Integer[] indexesByDescendingScore(final double[] scores) {
        Integer[] order = new Integer[scores.length];
        for (int n = 0; n < order.length; n++) {
            order[n] = Integer.valueOf(n);
        }
        Arrays.sort(order, new Comparator<Integer>() {
            @Override
            public int compare(Integer a, Integer b) {
                return Double.compare(scores[b.intValue()], scores[a.intValue()]);
            }
        });
        return order;
    }
}
