/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.classifier.bayes;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.GenericsUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.analysis.WikipediaAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Maps Wikipedia documents to {@code (category label, tokenized text)} pairs.
 *
 * <p>For each input value the mapper looks for {@code [[Category:...]]} markers. When one of them
 * matches a configured input category, the document text is run through the configured
 * {@link Analyzer} and the resulting terms are written out, space separated, under that category.
 * Documents for which no category matches are dropped.
 *
 * <p>Configuration keys read in {@code setup}:
 * <ul>
 *   <li>{@code wikipedia.categories} - stringified set of category names to look for</li>
 *   <li>{@code exact.match.only} - whether a category must equal a configured name (default false)</li>
 *   <li>{@code analyzer.class} - analyzer class name (default {@link WikipediaAnalyzer})</li>
 * </ul>
 */
public class WikipediaDatasetCreatorMapper
extends Mapper<LongWritable, Text, Text, Text> {
    private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
    private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
    private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
    private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
    /** Marker that opens a category link in the document text. */
    private static final String CATEGORY_OPEN = "[[Category:";
    /** Marker that closes a category link in the document text. */
    private static final String CATEGORY_CLOSE = "]]";
    /** Value returned by {@link #findMatchingCategory(String)} when nothing matches. */
    private static final String UNKNOWN_CATEGORY = "Unknown";
    private List<String> inputCategories;
    private List<Pattern> inputCategoryPatterns;
    private boolean exactMatchOnly;
    private Analyzer analyzer;

    /**
     * Emits the tokenized text of {@code value} keyed by its matching category, if there is one.
     *
     * <p>The output key is the matched category with every whitespace or non-word character
     * replaced by {@code '_'}; the output value is the analyzer's terms joined by single spaces
     * (with a trailing space).
     *
     * @param key input key; not used
     * @param value the document text
     * @param context sink for the output pair
     * @throws IOException if tokenizing or writing fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        String document = value.toString();
        String catMatch = findMatchingCategory(document);
        if (UNKNOWN_CATEGORY.equals(catMatch)) {
            return;
        }

        // Drop the first opening <text> tag and all closing </text> tags, then unescape entities.
        String withoutOpenTag = OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("");
        document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(withoutOpenTag).replaceAll(""));

        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        try {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            // Follow the TokenStream consumer workflow: reset, consume, end, close.
            stream.reset();
            while (stream.incrementToken()) {
                contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
            }
            stream.end();
        } finally {
            // Always release the stream (and its reader), even if tokenizing fails.
            stream.close();
        }

        context.write(
            new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
            new Text(contents.toString()));
    }

    /**
     * Loads the input categories, the match mode and the analyzer from the job configuration.
     *
     * <p>Categories and analyzer are only initialized when they have not been set yet.
     *
     * @param context supplies the job configuration
     * @throws IOException if the stringified category set cannot be decoded
     * @throws InterruptedException propagated from {@code super.setup}
     * @throws IllegalStateException if the analyzer class cannot be loaded or instantiated
     */
    protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        super.setup(context);
        Configuration conf = context.getConfiguration();

        if (inputCategories == null) {
            // An empty set serves both as the type token and as the default value.
            Set<String> newCategories = new HashSet<String>();
            DefaultStringifier<Set<String>> setStringifier =
                new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));
            String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
            Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr);

            inputCategories = new ArrayList<String>(inputCategoriesSet);
            // Kept index-aligned with inputCategories; findMatchingCategory relies on that.
            inputCategoryPatterns = new ArrayList<Pattern>(inputCategories.size());
            for (String inputCategory : inputCategories) {
                // NOTE(review): the category is inserted into the regex unescaped, so regex
                // metacharacters in a category name are interpreted as pattern syntax - confirm
                // whether that is intended before switching to Pattern.quote.
                inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*"));
            }
        }

        exactMatchOnly = conf.getBoolean("exact.match.only", false);

        if (analyzer == null) {
            try {
                String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
                Class<? extends Analyzer> analyzerClass = Class.forName(analyzerStr).asSubclass(Analyzer.class);
                analyzer = analyzerClass.newInstance();
            }
            catch (ClassNotFoundException e) {
                throw new IllegalStateException(e);
            }
            catch (IllegalAccessException e) {
                throw new IllegalStateException(e);
            }
            catch (InstantiationException e) {
                throw new IllegalStateException(e);
            }
        }

        log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
            new Object[]{inputCategories.size(), exactMatchOnly, analyzer.getClass().getName()});
    }

    /**
     * Finds the first category link in {@code document} that matches a configured category.
     *
     * <p>Each {@code [[Category:...]]} body is lower-cased (English locale) and trimmed. In exact
     * mode it must be contained in the configured categories and is returned as is; otherwise it
     * is tested against each configured category's pattern and the configured category name is
     * returned.
     *
     * @param document the document text to scan
     * @return the matching category, or {@code "Unknown"} if none matches
     */
    private String findMatchingCategory(String document) {
        int startIndex = 0;
        while (true) {
            int categoryIndex = document.indexOf(CATEGORY_OPEN, startIndex);
            if (categoryIndex == -1) {
                break;
            }
            categoryIndex += CATEGORY_OPEN.length();
            int endIndex = document.indexOf(CATEGORY_CLOSE, categoryIndex);
            if (endIndex < 0) {
                // Unterminated category link: nothing more to scan.
                break;
            }

            String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
            if (exactMatchOnly) {
                if (inputCategories.contains(category)) {
                    return category;
                }
            } else {
                for (int i = 0; i < inputCategories.size(); ++i) {
                    if (inputCategoryPatterns.get(i).matcher(category).matches()) {
                        return inputCategories.get(i);
                    }
                }
            }
            startIndex = endIndex;
        }
        return UNKNOWN_CATEGORY;
    }
}

