package edu.umd.cloud9.collection.wikipedia.graph;

import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.io.map.HMapSIW;
import edu.umd.cloud9.io.pair.PairOfIntString;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import edu.umd.cloud9.io.pair.PairOfStrings;
import java.io.IOException;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText.class */
public class ExtractWikipediaAnchorText extends Configured implements Tool {
    private static final Logger LOG = Logger.getLogger(ExtractWikipediaAnchorText.class);
    private static final String INPUT_OPTION = "input";
    private static final String OUTPUT_OPTION = "output";

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$MyMapper1.class */
    private static class MyMapper1 extends MapReduceBase implements Mapper<IntWritable, WikipediaPage, PairOfStringInt, PairOfStrings> {
        private static final PairOfStringInt KEYPAIR = new PairOfStringInt();
        private static final PairOfStrings VALUEPAIR = new PairOfStrings();

        private MyMapper1() {
        }

        public void map(IntWritable intWritable, WikipediaPage wikipediaPage, OutputCollector<PairOfStringInt, PairOfStrings> outputCollector, Reporter reporter) throws IOException {
            reporter.incrCounter(PageTypes.TOTAL, 1L);
            String title = wikipediaPage.getTitle();
            VALUEPAIR.set(wikipediaPage.getDocid(), "");
            KEYPAIR.set(title, 0);
            outputCollector.collect(KEYPAIR, VALUEPAIR);
            String substring = title.substring(0, 1);
            if (substring.matches("[A-Z]")) {
                KEYPAIR.set(title.replaceFirst(substring, substring.toLowerCase()), 0);
                outputCollector.collect(KEYPAIR, VALUEPAIR);
            }
            if (wikipediaPage.isRedirect()) {
                reporter.incrCounter(PageTypes.REDIRECT, 1L);
            } else if (wikipediaPage.isDisambiguation()) {
                reporter.incrCounter(PageTypes.DISAMBIGUATION, 1L);
            } else if (wikipediaPage.isEmpty()) {
                reporter.incrCounter(PageTypes.EMPTY, 1L);
            } else if (wikipediaPage.isArticle()) {
                reporter.incrCounter(PageTypes.ARTICLE, 1L);
                if (wikipediaPage.isStub()) {
                    reporter.incrCounter(PageTypes.STUB, 1L);
                }
            } else {
                reporter.incrCounter(PageTypes.NON_ARTICLE, 1L);
            }
            for (WikipediaPage.Link link : wikipediaPage.extractLinks()) {
                KEYPAIR.set(link.getTarget(), 1);
                VALUEPAIR.set(wikipediaPage.getDocid(), link.getAnchorText());
                outputCollector.collect(KEYPAIR, VALUEPAIR);
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (WikipediaPage) obj2, (OutputCollector<PairOfStringInt, PairOfStrings>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$MyMapper2.class */
    private static class MyMapper2 extends MapReduceBase implements Mapper<IntWritable, PairOfIntString, IntWritable, Text> {
        private static final IntWritable KEY = new IntWritable();
        private static final Text VALUE = new Text();

        private MyMapper2() {
        }

        public void map(IntWritable intWritable, PairOfIntString pairOfIntString, OutputCollector<IntWritable, Text> outputCollector, Reporter reporter) throws IOException {
            KEY.set(pairOfIntString.getLeftElement());
            VALUE.set(pairOfIntString.getRightElement());
            outputCollector.collect(KEY, VALUE);
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (PairOfIntString) obj2, (OutputCollector<IntWritable, Text>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$MyPartitioner1.class */
    private static class MyPartitioner1 implements Partitioner<PairOfStringInt, PairOfStrings> {
        private MyPartitioner1() {
        }

        public void configure(JobConf jobConf) {
        }

        public int getPartition(PairOfStringInt pairOfStringInt, PairOfStrings pairOfStrings, int i) {
            return (pairOfStringInt.getLeftElement().hashCode() & Integer.MAX_VALUE) % i;
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$MyReducer1.class */
    private static class MyReducer1 extends MapReduceBase implements Reducer<PairOfStringInt, PairOfStrings, IntWritable, PairOfIntString> {
        private static final IntWritable SRCID = new IntWritable();
        private static final PairOfIntString TARGET_ANCHOR_PAIR = new PairOfIntString();
        private String targetTitle;
        private int targetDocid;

        private MyReducer1() {
        }

        public void reduce(PairOfStringInt pairOfStringInt, Iterator<PairOfStrings> it, OutputCollector<IntWritable, PairOfIntString> outputCollector, Reporter reporter) throws IOException {
            if (pairOfStringInt.getRightElement() == 0) {
                this.targetTitle = pairOfStringInt.getLeftElement();
                this.targetDocid = Integer.parseInt(it.next().getLeftElement());
            } else if (pairOfStringInt.getLeftElement().equals(this.targetTitle)) {
                while (it.hasNext()) {
                    PairOfStrings next = it.next();
                    SRCID.set(Integer.parseInt(next.getLeftElement()));
                    TARGET_ANCHOR_PAIR.set(this.targetDocid, next.getRightElement());
                    outputCollector.collect(SRCID, TARGET_ANCHOR_PAIR);
                }
            }
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((PairOfStringInt) obj, (Iterator<PairOfStrings>) it, (OutputCollector<IntWritable, PairOfIntString>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$MyReducer2.class */
    private static class MyReducer2 extends MapReduceBase implements Reducer<IntWritable, Text, IntWritable, HMapSIW> {
        private static final HMapSIW map = new HMapSIW();

        private MyReducer2() {
        }

        public void reduce(IntWritable intWritable, Iterator<Text> it, OutputCollector<IntWritable, HMapSIW> outputCollector, Reporter reporter) throws IOException {
            map.clear();
            while (it.hasNext()) {
                map.increment(it.next().toString());
            }
            outputCollector.collect(intWritable, map);
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((IntWritable) obj, (Iterator<Text>) it, (OutputCollector<IntWritable, HMapSIW>) outputCollector, reporter);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/graph/ExtractWikipediaAnchorText$PageTypes.class */
    /** Counters incremented by {@code MyMapper1} to tally the kinds of pages it processes. */
    public enum PageTypes {
        /** Incremented once for every page passed to the mapper. */
        TOTAL,
        /** Pages for which {@code isRedirect()} is true. */
        REDIRECT,
        /** Non-redirect pages for which {@code isDisambiguation()} is true. */
        DISAMBIGUATION,
        /** Pages that are neither redirect nor disambiguation and for which {@code isEmpty()} is true. */
        EMPTY,
        /** Pages that pass none of the checks above and for which {@code isArticle()} is true. */
        ARTICLE,
        /** Articles for which {@code isStub()} is true; counted in addition to {@link #ARTICLE}. */
        STUB,
        /** Pages that match none of the other categories. */
        NON_ARTICLE
    }

    /**
     * Parses the command line and runs the two extraction phases back to back, handing phase 1's
     * output to phase 2 through a randomly named temporary path.
     *
     * @param strArr command-line arguments; both {@code -input <path>} and {@code -output <path>} are required
     * @return 0 after both phases have run; -1 if the arguments cannot be parsed or a required option is missing
     * @throws Exception if either phase fails
     */
    public int run(String[] strArr) throws Exception {
        Options cliOptions = new Options();
        cliOptions.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("input").create(INPUT_OPTION));
        cliOptions.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("output for adjacency list").create(OUTPUT_OPTION));

        CommandLine cmdline;
        try {
            cmdline = new GnuParser().parse(cliOptions, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        }

        boolean hasBothPaths = cmdline.hasOption(INPUT_OPTION) && cmdline.hasOption(OUTPUT_OPTION);
        if (!hasBothPaths) {
            new HelpFormatter().printHelp(getClass().getName(), cliOptions);
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }

        // Intermediate location shared by the two phases.
        int suffix = new Random().nextInt(10000);
        String tmpPath = "tmp-" + getClass().getCanonicalName() + "-" + suffix;

        task1(cmdline.getOptionValue(INPUT_OPTION), tmpPath);
        task2(tmpPath, cmdline.getOptionValue(OUTPUT_OPTION));
        return 0;
    }

    /**
     * Configures and runs phase 1: reads a sequence file of Wikipedia pages and writes a sequence
     * file of {@code IntWritable -> PairOfIntString} records produced by {@code MyMapper1},
     * {@code MyPartitioner1} and {@code MyReducer1}, using 10 reduce tasks.
     * Anything already present at the output path is deleted before the job starts.
     *
     * @param str  input path
     * @param str2 output path (recursively deleted first)
     * @throws IOException if the file system cannot be accessed or the job fails
     */
    private void task1(String str, String str2) throws IOException {
        LOG.info("Extracting anchor text (phase 1)...");
        LOG.info(" - input: " + str);
        LOG.info(" - output: " + str2);

        JobConf jobConf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        jobConf.setJobName(String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", str, str2));
        jobConf.setNumReduceTasks(10);

        // Set the paths through the same format classes the job is configured with below.
        SequenceFileInputFormat.addInputPath(jobConf, new Path(str));
        SequenceFileOutputFormat.setOutputPath(jobConf, new Path(str2));
        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);

        jobConf.setMapOutputKeyClass(PairOfStringInt.class);
        jobConf.setMapOutputValueClass(PairOfStrings.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(PairOfIntString.class);

        jobConf.setMapperClass(MyMapper1.class);
        jobConf.setReducerClass(MyReducer1.class);
        jobConf.setPartitionerClass(MyPartitioner1.class);

        // Remove any stale output so the job does not fail on an existing directory.
        FileSystem.get(jobConf).delete(new Path(str2), true);
        JobClient.runJob(jobConf);
    }

    /**
     * Configures and runs phase 2: reads the sequence file written by phase 1 and writes a map
     * file of {@code IntWritable -> HMapSIW} records produced by {@code MyMapper2} and
     * {@code MyReducer2}, using a single reduce task.
     * Anything already present at the output path is deleted before the job starts, and the input
     * path is deleted after the job has finished.
     *
     * @param str  input path (recursively deleted once the job completes)
     * @param str2 output path (recursively deleted first)
     * @throws IOException if the file system cannot be accessed or the job fails
     */
    private void task2(String str, String str2) throws IOException {
        LOG.info("Extracting anchor text (phase 2)...");
        LOG.info(" - input: " + str);
        LOG.info(" - output: " + str2);

        JobConf jobConf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
        jobConf.setJobName(String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", str, str2));
        jobConf.setNumReduceTasks(1);

        // Set the paths through the same format classes the job is configured with below.
        SequenceFileInputFormat.addInputPath(jobConf, new Path(str));
        MapFileOutputFormat.setOutputPath(jobConf, new Path(str2));
        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(MapFileOutputFormat.class);

        jobConf.setMapOutputKeyClass(IntWritable.class);
        jobConf.setMapOutputValueClass(Text.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(HMapSIW.class);

        jobConf.setMapperClass(MyMapper2.class);
        jobConf.setReducerClass(MyReducer2.class);

        // Remove any stale output so the job does not fail on an existing directory.
        FileSystem.get(jobConf).delete(new Path(str2), true);
        JobClient.runJob(jobConf);

        // The input is the intermediate output of phase 1; it is removed once consumed.
        FileSystem.get(jobConf).delete(new Path(str), true);
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} and exits the JVM with
     * the tool's return code.
     *
     * @param strArr command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(new ExtractWikipediaAnchorText(), strArr);
        System.exit(exitCode);
    }
}
