/*
 * Decompiled with CFR 0.152.
 */
package eu.openminted.uc.socialsciences.variabledetection.features;

import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import eu.openminted.uc.socialsciences.variabledetection.pipelines.VariableDisambiguationConstants;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

/**
 * Computes inverse document frequency (IDF) values for the lemmas found in the
 * SemEval-2012 STS input files and writes them to a tab-separated text file.
 *
 * <p>The lemmatization engine and its CAS are cached in static fields and reused
 * across calls; this class performs no synchronization around them.
 */
public class WordIdfValuesGenerator {
    static final String LF = System.getProperty("line.separator");
    private static AnalysisEngine lemmaEngine;
    private static JCas lemmaJCas;

    /**
     * Reads every line of the STS input files of the given datasets, lemmatizes
     * each line as one document and writes {@code lemma<TAB>idf} pairs to
     * {@code ../data/utils/word-idf/<mode>/<aTarget>.txt}.
     *
     * <p>If the output file already exists nothing is computed or written.
     *
     * @param aTarget  dataset whose name is used for the output file
     * @param mode     mode whose lower-cased name selects the input and output sub-directory
     * @param datasets datasets whose {@code STS.input.<dataset>.txt} files are read from the classpath
     * @throws Exception if an input resource cannot be resolved or read, if
     *                   lemmatization fails, or if the output file cannot be written
     */
    public static void computeIdfScores(VariableDisambiguationConstants.Dataset aTarget, VariableDisambiguationConstants.Mode mode, List<VariableDisambiguationConstants.Dataset> datasets) throws Exception {
        // Locale.ROOT keeps the path stable regardless of the JVM default locale
        // (e.g. the Turkish dotless-i mapping would otherwise change it).
        String modeName = mode.toString().toLowerCase(Locale.ROOT);

        List<String> lines = new ArrayList<String>();
        for (VariableDisambiguationConstants.Dataset dataset : datasets) {
            URL inputUrl = ResourceUtils.resolveLocation("classpath:/datasets/semeval-2012/" + modeName + "/STS.input." + dataset.toString() + ".txt");
            // Close the stream deterministically instead of leaking it.
            try (InputStream in = inputUrl.openStream()) {
                lines.addAll(IOUtils.readLines(in, StandardCharsets.UTF_8));
            }
        }

        File outputFile = new File("../data/utils/word-idf/" + modeName + "/" + aTarget + ".txt");
        System.out.println("Computing word idf values");
        if (outputFile.exists()) {
            System.out.println(" - skipping, already exists");
            return;
        }
        System.out.println(" - this may take a while...");

        Set<List<String>> docs = lemmatizeDocuments(lines);
        Map<String, Double> idfValues = computeIdf(docs, lines.size());

        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Double> entry : idfValues.entrySet()) {
            sb.append(entry.getKey()).append("\t").append(entry.getValue()).append(LF);
        }
        FileUtils.writeStringToFile(outputFile, sb.toString(), StandardCharsets.UTF_8);
        System.out.println(" - done");
    }

    /**
     * Turns every input line into the list of its lower-cased lemma values.
     *
     * <p>The result is a set, so lines that yield identical lemma lists are
     * collapsed into a single document.
     */
    private static Set<List<String>> lemmatizeDocuments(List<String> lines) throws Exception {
        Set<List<String>> docs = new HashSet<List<String>>();
        for (String line : lines) {
            List<String> tokens = new ArrayList<String>();
            for (Lemma lemma : getLemmas(line)) {
                String value = lemma.getValue();
                if (value == null) {
                    // Explicit check instead of catching NullPointerException.
                    System.err.println(" - unparsable token: " + lemma.getCoveredText());
                    continue;
                }
                tokens.add(value.toLowerCase(Locale.ROOT));
            }
            docs.add(tokens);
        }
        return docs;
    }

    /**
     * Computes {@code log10(documentCount / df)} for every token, where
     * {@code df} is the number of documents in {@code docs} containing the token.
     *
     * <p>NOTE(review): {@code docs} is de-duplicated while {@code documentCount}
     * is the total number of input lines, so duplicated lines count towards the
     * numerator only. This mirrors the previous behaviour; confirm it is intended.
     */
    private static Map<String, Double> computeIdf(Set<List<String>> docs, int documentCount) {
        // One pass over each document's distinct tokens replaces the former
        // tokens x documents x List.contains scan and yields the same counts.
        Map<String, Integer> documentFrequencies = new HashMap<String, Integer>();
        for (List<String> doc : docs) {
            for (String token : new HashSet<String>(doc)) {
                Integer seen = documentFrequencies.get(token);
                documentFrequencies.put(token, seen == null ? 1 : seen + 1);
            }
        }

        Map<String, Double> idfValues = new HashMap<String, Double>();
        for (Map.Entry<String, Integer> entry : documentFrequencies.entrySet()) {
            // Every recorded token occurs in at least one document, so df >= 1.
            double idf = Math.log10((double) documentCount / entry.getValue());
            idfValues.put(entry.getKey(), idf);
        }
        return idfValues;
    }

    /**
     * Runs the segmenter, POS tagger and lemmatizer over the given text and
     * returns the resulting {@link Lemma} annotations.
     *
     * <p>The returned collection is selected from the shared, cached CAS, which
     * is reset at the start of the next call; callers must consume the lemmas
     * before calling this method again.
     *
     * @param fileContents text to lemmatize; processed with document language {@code "en"}
     * @return the lemma annotations selected from the CAS after processing
     * @throws Exception if the engine or CAS cannot be created or processing fails
     */
    private static Collection<Lemma> getLemmas(String fileContents) throws Exception {
        // Initialise the two caches independently so a failure while creating the
        // CAS does not leave a half-initialised state for later calls.
        if (lemmaEngine == null) {
            AnalysisEngineDescription pipeline = AnalysisEngineFactory.createEngineDescription(
                    AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
                    AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class, "language", "en"),
                    AnalysisEngineFactory.createEngineDescription(StanfordLemmatizer.class));
            lemmaEngine = AnalysisEngineFactory.createEngine(pipeline);
        }
        if (lemmaJCas == null) {
            lemmaJCas = JCasFactory.createJCas();
        }
        lemmaJCas.reset();
        lemmaJCas.setDocumentLanguage("en");
        lemmaJCas.setDocumentText(fileContents);
        lemmaEngine.process(lemmaJCas);
        return JCasUtil.select(lemmaJCas, Lemma.class);
    }
}

