package org.wikibrain.sr.word2vec.universal;

import gnu.trove.map.TIntIntMap;
import gnu.trove.map.TIntObjectMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.dao.UniversalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.nlp.Dictionary;
import org.wikibrain.download.FileDownloader;
import org.wikibrain.phrases.LinkProbabilityDao;
import org.wikibrain.sr.SRBuilder;
import org.wikibrain.sr.wikify.Corpus;
import org.wikibrain.sr.wikify.PlainTextCorpusCreator;
import org.wikibrain.sr.wikify.WbCorpusLineReader;
import org.wikibrain.sr.wikify.WebSailWikifier;
import org.wikibrain.sr.wikify.WikiTextCorpusCreator;
import org.wikibrain.sr.wikify.Wikifier;
import org.wikibrain.utils.WpIOUtils;

/* loaded from: input_file:org/wikibrain/sr/word2vec/universal/UniversalWord2VecMain.class */
public class UniversalWord2VecMain {
    /** Size threshold in bytes (50 MiB) handed to {@link RotatingWriter} for each output file. */
    private static final int OPTIMAL_FILE_SIZE = 50 * 1024 * 1024;

    /** Language whose corpus this instance generates. */
    private final Language lang;

    /** Environment used to look up DAOs, the wikifier and the other components. */
    private final Env env;

    /**
     * Entry for {@link #lang} of the map returned by
     * {@code UniversalPageDao.getAllLocalToUnivIdsMap} (judging by that name: local page id to
     * universal concept id); empty when the DAO has no entry for the language.
     */
    private final TIntIntMap concepts;

    /** DAO used to resolve local page ids to pages when building compact urls. */
    private final LocalPageDao pageDao;

    /**
     * Cache of page id to compact url. An empty string records that the page was not found.
     * All access is synchronized on the map itself.
     */
    private final TIntObjectMap<String> shortUrls = new TIntObjectHashMap<String>();

    private static final Logger LOG = LoggerFactory.getLogger(UniversalWord2VecMain.class);

    /**
     * Additional plain-text corpora as {language code, URL of a gzipped file} pairs.
     * Every entry whose language code equals the current language is downloaded and wikified.
     * The 2012 "ru" entry points at the 2012 file; it previously repeated the 2013 URL, which
     * made the 2013 Russian corpus be processed twice.
     */
    private static final String[][] CORPORA = {
        {"simple", "http://shilad.com/news.2007.en.shuffled.gz"},
        {"cs", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.cs.shuffled.gz"},
        {"de", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"},
        {"en", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz"},
        {"es", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.es.shuffled.gz"},
        {"fr", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.fr.shuffled.gz"},
        {"hi", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.hi.shuffled.gz"},
        {"ru", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.ru.shuffled.gz"},
        {"cs", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.cs.shuffled.gz"},
        {"de", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"},
        {"en", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz"},
        {"es", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.es.shuffled.gz"},
        {"fr", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.fr.shuffled.gz"},
        {"hi", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.hi.shuffled.gz"},
        {"ru", "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.ru.shuffled.gz"},
    };

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/wikibrain/sr/word2vec/universal/UniversalWord2VecMain$RotatingWriter.class */
    /**
     * Writes text to a numbered sequence of files named
     * {@code prefix + zero-padded five-digit index + suffix}, starting at index 0.
     *
     * <p>Before every {@link #write(String)} the writer checks whether the current file has
     * reached {@code maxBytes} (counted as UTF-8 bytes) and, if so, closes it and opens the next
     * one. A single {@code write} call is therefore never split across two files, and a file can
     * exceed {@code maxBytes} by at most the size of the last string written to it.
     */
    public static class RotatingWriter implements Closeable {
        private final String prefix;
        private final String suffix;
        private final int maxBytes;
        private int fileNum = 0;
        private int numBytes = 0;
        private BufferedWriter writer = null;

        /**
         * @param prefix   path prefix of every output file
         * @param suffix   suffix appended after the file index
         * @param maxBytes size in bytes at which the next write starts a new file
         */
        RotatingWriter(String prefix, String suffix, int maxBytes) {
            this.prefix = prefix;
            this.suffix = suffix;
            this.maxBytes = maxBytes;
        }

        /**
         * Appends {@code str} to the current file, rotating to a new file first if needed.
         *
         * @throws IOException if a file cannot be opened, closed or written
         */
        void write(String str) throws IOException {
            possiblyRotateWriter();
            this.numBytes += str.getBytes("UTF-8").length;
            this.writer.write(str);
        }

        /** Opens the first file, or moves on to the next one once the current file is full. */
        private void possiblyRotateWriter() throws IOException {
            if (this.writer != null && this.numBytes < this.maxBytes) {
                return;
            }
            if (this.writer != null) {
                close();
                this.fileNum++;
                this.numBytes = 0;
            }
            this.writer = WpIOUtils.openWriter(String.format("%s%05d%s", this.prefix, this.fileNum, this.suffix));
        }

        /**
         * Flushes and closes the current file, if any. Calling it again is a no-op.
         *
         * @throws IOException if flushing or closing fails; swallowing this error would hide a
         *         truncated output file
         */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() throws IOException {
            BufferedWriter current = this.writer;
            // Clear the field first so that a failed close is not retried on the same writer.
            this.writer = null;
            if (current != null) {
                current.close();
            }
        }
    }

    /**
     * Creates a corpus generator for one language.
     *
     * <p>Looks up the page DAOs from {@code env} and loads the language's entry of the map
     * returned by {@code UniversalPageDao.getAllLocalToUnivIdsMap}; an empty map is used when
     * the DAO has no entry for the language.
     *
     * @param env      environment supplying the DAOs and other components
     * @param language language to generate the corpus for
     * @throws ConfigurationException if a component cannot be configured
     * @throws DaoException           if the id mapping cannot be loaded
     */
    public UniversalWord2VecMain(Env env, Language language) throws ConfigurationException, DaoException {
        this.env = env;
        this.lang = language;

        UniversalPageDao universalDao = (UniversalPageDao) env.getConfigurator().get(UniversalPageDao.class);
        this.pageDao = (LocalPageDao) env.getComponent(LocalPageDao.class);

        Map<?, ?> idsByLanguage = universalDao.getAllLocalToUnivIdsMap(new LanguageSet(language));
        if (idsByLanguage.containsKey(language)) {
            this.concepts = (TIntIntMap) idsByLanguage.get(language);
        } else {
            this.concepts = new TIntIntHashMap();
        }
    }

    /**
     * Generates the corpus for this instance's language under the directory {@code str}.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>runs an {@link SRBuilder} for the "word2vec" metric (skipping built metrics, with a
     *       fake gold standard);</li>
     *   <li>deletes and recreates the output directory;</li>
     *   <li>makes sure the configured "wikified" corpus exists;</li>
     *   <li>wikifies the Wikipedia text into a temporary directory and converts every line with
     *       {@code processLine};</li>
     *   <li>downloads, wikifies and converts every {@code CORPORA} entry for this language.</li>
     * </ol>
     * Output goes to files named {@code corpus.<lang>.NNNNN.txt}. The output writer is closed and
     * the temporary directory removed even when a step fails.
     *
     * @param str output directory; any existing content is deleted
     * @throws IllegalStateException if no wikified corpus is configured for the language
     */
    public void create(String str) throws ConfigurationException, DaoException, WikiBrainException, IOException, InterruptedException {
        SRBuilder builder = new SRBuilder(this.env, "word2vec", this.lang);
        builder.setSkipBuiltMetrics(true);
        builder.setCreateFakeGoldStandard(true);
        builder.build();

        File outputDir = new File(str);
        FileUtils.deleteQuietly(outputDir);
        FileUtils.forceMkdir(outputDir);

        Corpus corpus = (Corpus) this.env.getConfigurator().get(Corpus.class, "wikified", "language", this.lang.getLangCode());
        if (corpus == null) {
            throw new IllegalStateException("Couldn't find wikified corpus for language " + this.lang);
        }
        if (!corpus.exists()) {
            corpus.create();
        }

        RawPageDao rawPageDao = (RawPageDao) this.env.getConfigurator().get(RawPageDao.class);
        LocalPageDao localPageDao = (LocalPageDao) this.env.getConfigurator().get(LocalPageDao.class);
        Wikifier wikifier = (Wikifier) this.env.getComponent(Wikifier.class, "websail-final", this.lang);
        WebSailWikifier webSail = (WebSailWikifier) wikifier;
        webSail.setMinFinalScore(1.0E-5d);
        webSail.setDesiredLinkRecall(0.995d);
        LinkProbabilityDao linkProbabilityDao = (LinkProbabilityDao) this.env.getComponent(LinkProbabilityDao.class, this.lang);

        RotatingWriter rotatingWriter = new RotatingWriter(str + "/corpus." + this.lang.getLangCode() + ".", ".txt", OPTIMAL_FILE_SIZE);
        File tempDir = null;
        try {
            tempDir = WpIOUtils.createTempDirectory(this.lang.getLangCode() + "corpora");

            // Wikipedia text: the document id and line positions are known.
            File wikipediaDir = new File(tempDir, "wikipedia");
            new WikiTextCorpusCreator(this.lang, wikifier, rawPageDao, localPageDao, linkProbabilityDao).write(wikipediaDir);
            FileUtils.forceDeleteOnExit(wikipediaDir);
            Iterator<WbCorpusLineReader.Line> wikiLines = new WbCorpusLineReader(new File(wikipediaDir, "corpus.txt")).iterator();
            while (wikiLines.hasNext()) {
                WbCorpusLineReader.Line line = wikiLines.next();
                processLine(rotatingWriter, line.getLine(), line.getDocId(), line.getLineNumber(), line.getCharNumber());
            }
            // Free the disk space before the additional corpora are downloaded.
            FileUtils.deleteQuietly(wikipediaDir);

            // Additional corpora: no document id or positions, hence -1 for all three.
            for (String[] entry : CORPORA) {
                if (!entry[0].equals(this.lang.getLangCode())) {
                    continue;
                }
                URL url = new URL(entry[1]);
                File download = new FileDownloader().download(url, new File(tempDir, new File(url.getFile()).getName()));
                download.deleteOnExit();
                // Strip only a trailing ".gz", not every occurrence in the path.
                String basePath = download.toString();
                if (basePath.endsWith(".gz")) {
                    basePath = basePath.substring(0, basePath.length() - ".gz".length());
                }
                File wikifiedDir = new File(basePath + ".wikified");
                new PlainTextCorpusCreator(this.lang, wikifier, localPageDao, linkProbabilityDao, download).write(wikifiedDir);
                Iterator<WbCorpusLineReader.Line> newsLines = new WbCorpusLineReader(new File(wikifiedDir, "corpus.txt")).iterator();
                while (newsLines.hasNext()) {
                    processLine(rotatingWriter, newsLines.next().getLine(), -1, -1, -1);
                }
            }
        } finally {
            // Release the writer and the temporary files on failure as well as on success.
            try {
                rotatingWriter.close();
            } finally {
                if (tempDir != null) {
                    FileUtils.deleteQuietly(tempDir);
                }
            }
        }
    }

    /**
     * Returns the compact url of the page with id {@code i} in this instance's language.
     *
     * <p>Results are cached in {@code shortUrls}; a page that could not be found is cached as
     * an empty string so that the DAO is not queried for it again. The whole lookup, including
     * the DAO call, runs while holding the cache's monitor.
     *
     * @param i page id; negative ids are never looked up
     * @return the compact url, or {@code null} if {@code i} is negative or the page is unknown
     * @throws IOException wrapping any {@link DaoException} raised by the page DAO
     */
    protected String getShortUrl(int i) throws IOException {
        if (i < 0) {
            return null;
        }
        synchronized (this.shortUrls) {
            if (this.shortUrls.containsKey(i)) {
                String cached = this.shortUrls.get(i);
                if (cached.isEmpty()) {
                    return null;
                }
                return cached;
            }
            try {
                LocalPage page = this.pageDao.getById(this.lang, i);
                if (page != null) {
                    String url = page.getCompactUrl();
                    this.shortUrls.put(i, url);
                    return url;
                }
                // Remember the miss.
                this.shortUrls.put(i, "");
                return null;
            } catch (DaoException e) {
                throw new IOException(e);
            }
        }
    }

    /**
     * Converts one wikified corpus line into an output record and writes it.
     *
     * <p>The record is {@code lineNumber \t charNumber \t docTags \t tokens \n}, where
     * {@code docTags} and {@code tokens} are space-separated:
     * <ul>
     *   <li>{@code docTags}: the document's compact url (if any) and {@code /c/<id>} when
     *       {@code docId} has an entry in {@code concepts};</li>
     *   <li>{@code tokens}: each plain word becomes {@code <lang>:<word>}. A word carrying a
     *       mention (a {@code :/w/...} part matching {@code Dictionary.PATTERN_MENTION}) becomes
     *       the word token, the mention text after the colon and, when mapped, {@code /c/<id>},
     *       in random order. Group 3 of the pattern is parsed as the page id; a negative id
     *       keeps only the word token.</li>
     * </ul>
     * The record is written with a single {@code write} call so that file rotation can never
     * split it across two output files.
     *
     * @param rotatingWriter destination
     * @param str            wikified line, tokens separated by spaces
     * @param docId          id of the source page, or negative if there is none
     * @param lineNumber     line number written to the first column
     * @param charNumber     character offset written to the second column
     * @throws IOException if the url lookup or the write fails
     */
    private void processLine(RotatingWriter rotatingWriter, String str, int docId, int lineNumber, int charNumber) throws IOException {
        ArrayList<String> tokens = new ArrayList<String>();
        for (String raw : str.split(" +")) {
            int mentionStart = raw.indexOf(":/w/");
            Matcher mention = (mentionStart < 0) ? null : Dictionary.PATTERN_MENTION.matcher(raw.substring(mentionStart));
            if (mention == null || !mention.matches()) {
                tokens.add(makeWordToken(raw));
                continue;
            }
            ArrayList<String> variants = new ArrayList<String>();
            variants.add(makeWordToken(raw.substring(0, mentionStart)));
            int pageId = Integer.parseInt(mention.group(3));
            if (pageId >= 0) {
                variants.add(raw.substring(mentionStart + 1));
                if (this.concepts.containsKey(pageId)) {
                    variants.add("/c/" + this.concepts.get(pageId));
                }
                // Randomize the order of the word, mention and concept tokens.
                Collections.shuffle(variants);
            }
            tokens.addAll(variants);
        }

        ArrayList<String> docTags = new ArrayList<String>();
        String shortUrl = getShortUrl(docId);
        if (shortUrl != null) {
            docTags.add(shortUrl);
        }
        if (this.concepts.containsKey(docId)) {
            docTags.add("/c/" + this.concepts.get(docId));
        }

        StringBuilder record = new StringBuilder();
        record.append(lineNumber).append("\t").append(charNumber).append("\t");
        record.append(StringUtils.join(docTags, " ")).append("\t");
        record.append(StringUtils.join(tokens, " ")).append("\n");
        rotatingWriter.write(record.toString());
    }

    /**
     * Prefixes a word with this instance's language code.
     *
     * @param str the word
     * @return {@code <langCode>:<str>}
     */
    private String makeWordToken(String str) {
        String langCode = this.lang.getLangCode();
        return langCode + ":" + str;
    }

    /**
     * Command-line entry point: generates a corpus for every language of the environment.
     *
     * <p>Each language is written to {@code <output>/<langCode>}. A language whose directory
     * already exists and is non-empty is skipped. A failure for one language is logged and the
     * remaining languages are still processed; an interruption stops the run and restores the
     * thread's interrupt flag. Invalid options print a usage message.
     *
     * @param strArr command-line arguments; {@code -o/--output} is required
     */
    public static void main(String[] strArr) throws ConfigurationException, DaoException, IOException, WikiBrainException, InterruptedException {
        Options options = new Options();
        options.addOption(new DefaultOptionBuilder().hasArg().isRequired().withLongOpt("output").withDescription("corpus output directory (existing data will be lost)").create("o"));
        EnvBuilder.addStandardOptions(options);
        try {
            CommandLine cmd = new PosixParser().parse(options, strArr);
            Env env = new EnvBuilder(cmd).build();
            for (Language language : env.getLanguages()) {
                try {
                    String outputPath = cmd.getOptionValue("o") + "/" + language.getLangCode();
                    File outputDir = new File(outputPath);
                    if (outputDir.isDirectory()) {
                        // File.list() returns null when the directory cannot be read.
                        String[] children = outputDir.list();
                        if (children == null) {
                            throw new IOException("Cannot list output directory " + outputDir);
                        }
                        if (children.length > 0) {
                            LOG.info("Skipping language " + language + ": " + outputDir + " is not empty");
                            continue;
                        }
                    }
                    LOG.info("Generating corpus for language " + language);
                    // Built only when needed: the constructor loads the concept id mapping.
                    new UniversalWord2VecMain(env, language).create(outputPath);
                } catch (InterruptedException e) {
                    // Do not start on the next language after being interrupted.
                    Thread.currentThread().interrupt();
                    LOG.warn("Interrupted while generating corpus for language " + language, e);
                    break;
                } catch (Exception e) {
                    LOG.warn("Generation of corpus for language " + language + " failed", e);
                }
            }
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("UniversalWord2VecMain", options);
        }
    }
}
