package org.wikibrain.sr.wikify;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.logging.Logger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.RawPage;

/* loaded from: input_file:org/wikibrain/sr/wikify/WikiTextCorpusCreator.class */
/**
 * Corpus creator whose corpus entries are built from the plain text of raw pages
 * fetched through a {@link RawPageDao}.
 *
 * <p>Pages are selected for a single language and capped at {@link #setMaxPages(int)}
 * pages (unlimited, i.e. {@code Integer.MAX_VALUE}, by default).
 */
public class WikiTextCorpusCreator extends BaseCorpusCreator {
    private static final Logger LOG = Logger.getLogger(WikiTextCorpusCreator.class.getName());

    /** Language whose pages are requested from the DAO. */
    private final Language language;

    /** Source of the raw pages. */
    private final RawPageDao dao;

    /** Limit passed to the DAO filter; defaults to "no limit". */
    private int maxPages = Integer.MAX_VALUE;

    /**
     * Adapts an iterator over {@link RawPage}s into an iterator over {@link IdAndText},
     * skipping {@code null} pages, pages whose plain text is {@code null} or blank after
     * trimming, and pages whose text extraction throws.
     *
     * <p>The iterator keeps a one-element look-ahead buffer. The buffer is per-instance
     * state: it must not be shared between iterators, otherwise concurrently or
     * successively created iterators would steal or clobber each other's next element.
     */
    public static class RawPageTextIterator implements Iterator<IdAndText> {
        private final Iterator<RawPage> iter;

        /** Next element to hand out, or {@code null} when the underlying iterator is exhausted. */
        private IdAndText buffer;

        /**
         * Wraps the given page iterator and eagerly looks up the first usable page.
         *
         * @param it iterator supplying the raw pages
         */
        public RawPageTextIterator(Iterator<RawPage> it) {
            this.iter = it;
            fillBuffer();
        }

        /**
         * @return {@code true} if a page with non-blank plain text is buffered
         */
        @Override
        public boolean hasNext() {
            return buffer != null;
        }

        /**
         * Returns the buffered element and advances the look-ahead.
         *
         * @return the next id/text pair, or {@code null} if the iterator is exhausted
         *         (this implementation does not throw {@code NoSuchElementException};
         *         that historical behaviour is preserved for existing callers)
         */
        @Override
        public IdAndText next() {
            IdAndText current = buffer;
            if (current != null) {
                buffer = null;
                fillBuffer();
            }
            return current;
        }

        /**
         * Removal is not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Advances the underlying iterator until a page with non-blank plain text is
         * found (stored trimmed in the buffer) or the underlying iterator runs out.
         */
        private void fillBuffer() {
            while (buffer == null && this.iter.hasNext()) {
                RawPage page = this.iter.next();
                if (page == null) {
                    continue;
                }
                try {
                    String plainText = page.getPlainText();
                    if (plainText != null) {
                        String trimmed = plainText.trim();
                        if (trimmed.length() > 0) {
                            buffer = new IdAndText(page.getLocalId(), trimmed);
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a page that cannot be converted is skipped, but the
                    // cause is kept in the log record so failures can be diagnosed.
                    WikiTextCorpusCreator.LOG.log(
                            java.util.logging.Level.WARNING,
                            "Error when extracting text from: " + page.getTitle(),
                            e);
                }
            }
        }
    }

    /**
     * Creates a corpus creator for one language.
     *
     * @param language     language whose pages make up the corpus
     * @param wikifier     wikifier handed to the base corpus creator
     * @param rawPageDao   DAO the raw pages are read from
     * @param localPageDao local page DAO handed to the base corpus creator
     */
    public WikiTextCorpusCreator(Language language, Wikifier wikifier, RawPageDao rawPageDao, LocalPageDao localPageDao) {
        super(language, localPageDao, wikifier);
        this.language = language;
        this.dao = rawPageDao;
    }

    /**
     * Sets the limit applied to the DAO query issued by {@link #getCorpus()}.
     *
     * @param i maximum number of pages requested from the DAO
     */
    public void setMaxPages(int i) {
        this.maxPages = i;
    }

    /**
     * Queries the raw page DAO for this creator's language, with the redirect and
     * disambiguation filter flags set to {@code false} and the configured page limit,
     * and exposes the result as id/text pairs.
     *
     * @return iterator over the pages that have non-blank plain text
     * @throws DaoException if the DAO query fails
     */
    @Override
    public Iterator<IdAndText> getCorpus() throws DaoException {
        DaoFilter filter = new DaoFilter()
                .setRedirect(false)
                .setDisambig(false)
                .setLanguages(this.language)
                .setLimit(this.maxPages);
        return new RawPageTextIterator(this.dao.get(filter).iterator());
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -o/--output} (required) the corpus output directory;
     * {@code -x/--max-articles} the maximum number of articles to process; plus the
     * standard options registered by {@link EnvBuilder#addStandardOptions(Options)}.
     * On an option parsing error a message and the usage help are printed and the
     * method returns normally.
     *
     * @param strArr command-line arguments
     * @throws ConfigurationException if a component cannot be configured
     * @throws IOException            if writing the corpus fails
     * @throws DaoException           if reading pages fails
     * @throws NumberFormatException  if the {@code -x} value is not a valid integer
     */
    public static void main(String[] strArr) throws ConfigurationException, IOException, DaoException {
        Options options = new Options();
        options.addOption(new DefaultOptionBuilder().hasArg().isRequired().withLongOpt("output").withDescription("corpus output directory (existing data will be lost)").create("o"));
        options.addOption(new DefaultOptionBuilder().hasArg().withLongOpt("max-articles").withDescription("Maximum number of articles to process").create("x"));
        EnvBuilder.addStandardOptions(options);

        CommandLine cmd;
        try {
            cmd = new PosixParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("WikiTextCorpusCreator", options);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        RawPageDao rawPageDao = (RawPageDao) env.getConfigurator().get(RawPageDao.class);
        LocalPageDao localPageDao = (LocalPageDao) env.getConfigurator().get(LocalPageDao.class);
        Language defaultLanguage = env.getLanguages().getDefaultLanguage();
        // The wikifier is looked up per language via the "language" runtime parameter.
        Wikifier wikifier = (Wikifier) env.getConfigurator().get(Wikifier.class, "default", "language", defaultLanguage.getLangCode());

        WikiTextCorpusCreator creator = new WikiTextCorpusCreator(defaultLanguage, wikifier, rawPageDao, localPageDao);
        if (cmd.hasOption("x")) {
            creator.setMaxPages(Integer.parseInt(cmd.getOptionValue("x")));
        }
        creator.write(new File(cmd.getOptionValue("o")));
    }
}
