package org.wikibrain.sr.wikify;

import gnu.trove.TCollections;
import gnu.trove.map.TIntObjectMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.LocalLink;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.nlp.Dictionary;
import org.wikibrain.core.nlp.StringTokenizer;
import org.wikibrain.core.nlp.Token;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;

/* loaded from: input_file:org/wikibrain/sr/wikify/BaseCorpusCreator.class */
/**
 * Base class for writing a wikified text corpus and its word dictionary to disk.
 *
 * <p>Subclasses supply the documents via {@link #getCorpus()}. For every document the
 * configured {@link Wikifier} is asked for links, the text is split into sentences and
 * words, and each word (or linked multi-word mention) is written to {@code corpus.txt}.
 * A linked mention is emitted as its words joined by {@code '_'} followed by {@code ':'}
 * and the URL of the link destination. Each sentence occupies one line and each document
 * is followed by a blank line. Token counts are accumulated in a {@link Dictionary} that
 * is written to {@code dictionary.txt} once all documents have been processed.
 */
public abstract class BaseCorpusCreator {
    private static final Logger LOG = Logger.getLogger(BaseCorpusCreator.class.getName());

    private final Language language;
    private final Wikifier wikifier;
    private final LocalPageDao pageDao;
    private final StringTokenizer tokenizer = new StringTokenizer();

    /** Cache of page id to mention URL, wrapped in a synchronized map. */
    private final TIntObjectMap<String> mentionUrls =
            TCollections.synchronizedMap(new TIntObjectHashMap<String>());

    /** Word counts for the corpus being written; (re)created by {@link #write(File)}. */
    private Dictionary dictionary;

    /** Writer for {@code corpus.txt}; opened and closed by {@link #write(File)}. */
    private BufferedWriter corpus;

    /**
     * @param language     language passed to the tokenizer, the dictionary and page lookups
     * @param localPageDao DAO used to resolve link destination ids to pages
     * @param wikifier     wikifier used to detect links in each document
     */
    public BaseCorpusCreator(Language language, LocalPageDao localPageDao, Wikifier wikifier) {
        this.language = language;
        this.pageDao = localPageDao;
        this.wikifier = wikifier;
    }

    /**
     * Supplies the documents that make up the corpus.
     *
     * @return an iterator over the documents to process
     * @throws DaoException if the documents cannot be retrieved
     */
    public abstract Iterator<IdAndText> getCorpus() throws DaoException;

    /**
     * Writes {@code corpus.txt} and {@code dictionary.txt} into the given directory.
     *
     * <p>Anything already present at {@code file} is deleted first and the directory is
     * recreated.
     *
     * @param file the output directory
     * @throws IOException  if the directory cannot be created or a file cannot be written
     * @throws DaoException if the corpus cannot be retrieved
     */
    public void write(File file) throws IOException, DaoException {
        if (file.exists()) {
            FileUtils.deleteQuietly(file);
        }
        // Fail early with a clear message instead of a confusing error when opening the writer.
        if (!file.mkdirs() && !file.isDirectory()) {
            throw new IOException("Could not create output directory " + file);
        }
        this.dictionary = new Dictionary(this.language, Dictionary.WordStorage.ON_DISK);
        this.corpus = WpIOUtils.openWriter(new File(file, "corpus.txt"));
        try {
            ParallelForEach.iterate(getCorpus(), new Procedure<IdAndText>() {
                public void call(IdAndText idAndText) throws Exception {
                    BaseCorpusCreator.this.processText(idAndText);
                }
            });
        } finally {
            // Always release the writer, even if reading or processing the corpus fails.
            this.corpus.close();
        }
        this.dictionary.write(new File(file, "dictionary.txt"));
    }

    /**
     * Wikifies and tokenizes a single document, appends it to the corpus file and adds
     * its tokens to the dictionary.
     *
     * <p>Documents with a non-negative id are wikified with that id; otherwise only the
     * text is passed to the wikifier. Must only be called while {@link #write(File)} is
     * running, because the writer and dictionary are created there.
     *
     * @param idAndText the document to process
     * @throws IOException  if writing to the corpus or counting tokens fails
     * @throws DaoException if wikification or a page lookup fails
     */
    public void processText(IdAndText idAndText) throws IOException, DaoException {
        List<LocalLink> links;
        if (idAndText.getId() >= 0) {
            links = this.wikifier.wikify(idAndText.getId(), idAndText.getText());
        } else {
            links = this.wikifier.wikify(idAndText.getText());
        }

        StringBuilder document = new StringBuilder();
        for (Token sentence : this.tokenizer.getSentenceTokens(this.language, idAndText.getText())) {
            String line = processSentence(sentence, links);
            if (line != null) {
                document.append(line);
            }
        }

        String text = document.toString();
        // The writer is shared by every call, so each document is written under its lock.
        synchronized (this.corpus) {
            this.corpus.write(text + "\n\n");
        }
        countTokens(text);
    }

    /** Adds the tokens of already-processed text to the dictionary. */
    private void countTokens(String str) throws IOException {
        this.dictionary.countNormalizedText(str);
    }

    /**
     * Renders one sentence as a single line of space-separated tokens.
     *
     * <p>NOTE(review): the single forward pass over {@code links} assumes they are ordered
     * by ascending location; confirm against the wikifier implementations.
     *
     * @param sentence the sentence token to split into words
     * @param links    the links detected in the enclosing document
     * @return the rendered line terminated by {@code '\n'}, or {@code null} if the
     *         sentence contains no word tokens
     */
    private String processSentence(Token sentence, List<LocalLink> links) throws IOException, DaoException {
        List<Token> words = this.tokenizer.getWordTokens(this.language, sentence);
        if (words.isEmpty()) {
            return null;
        }

        StringBuilder line = new StringBuilder();
        int linkIdx = 0;
        int wordIdx = 0;
        while (wordIdx < words.size()) {
            Token word = words.get(wordIdx);

            // Skip links that start before the current word.
            while (linkIdx < links.size() && links.get(linkIdx).getLocation() < word.getBegin()) {
                linkIdx++;
            }

            StringBuilder phrase = new StringBuilder(word.getToken());
            if (linkIdx < links.size() && links.get(linkIdx).getLocation() < word.getEnd()) {
                LocalLink link = links.get(linkIdx);
                int mentionEnd = link.getLocation() + link.getAnchorText().length();

                // Absorb every following word that begins inside the anchor text.
                while (wordIdx + 1 < words.size() && words.get(wordIdx + 1).getBegin() < mentionEnd) {
                    if (phrase.length() > 0) {
                        phrase.append('_');
                    }
                    wordIdx++;
                    phrase.append(words.get(wordIdx).getToken());
                }
                phrase.append(':').append(getMentionUrl(link.getDestId()));
            }

            String rendered = phrase.toString().trim();
            if (rendered.length() != 0) {
                if (rendered.contains("\n")) {
                    // A newline inside a token would break the one-sentence-per-line layout.
                    throw new IllegalStateException("Token contains a newline: '" + rendered + "'");
                }
                if (line.length() > 0) {
                    line.append(' ');
                }
                line.append(rendered);
            }
            wordIdx++;
        }
        line.append('\n');
        return line.toString();
    }

    /**
     * Returns the URL to emit for a link destination, caching the result per page id.
     *
     * <p>The lookup and the insert are two separate map operations, so two callers may
     * both resolve the same id; both store a URL derived from the same page lookup.
     *
     * @param pageId id of the link destination page
     * @return the page's compact URL, or a placeholder URL if the page does not exist
     */
    private String getMentionUrl(int pageId) throws DaoException {
        String url = this.mentionUrls.get(pageId);
        if (url == null) {
            LocalPage page = this.pageDao.getById(this.language, pageId);
            if (page == null) {
                url = "/w/" + this.language.getLangCode() + "/-1/Unknown_page";
            } else {
                url = page.getCompactUrl();
            }
            this.mentionUrls.put(pageId, url);
        }
        return url;
    }
}
