package edu.umd.cloud9.webgraph;

import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.collection.clue.ClueWarcRecord;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:edu/umd/cloud9/webgraph/ClueExtractLinks.class */
public class ClueExtractLinks extends PowerTool {
    /** Logger used by {@link #runTool()} to report the job settings. */
    private static final Logger LOG = Logger.getLogger(ClueExtractLinks.class);
    /**
     * Configuration keys handed back by {@link #getRequiredParameters()}. The same keys are read
     * by {@link #runTool()} and by the mapper's {@code configure} method.
     */
    public static final String[] RequiredParameters = {"Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers", "Cloud9.Reducers", "Cloud9.DocnoMappingFile", "Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer"};

    /* loaded from: input_file:edu/umd/cloud9/webgraph/ClueExtractLinks$Map.class */
    /**
     * Mapper that turns one ClueWeb WARC record into link records keyed by URL.
     *
     * <p>For every record with a resolvable docno and target URI it emits:
     * <ul>
     *   <li>one {@code DOCNO_FIELD} entry keyed by the page's own URL, and</li>
     *   <li>one {@code EXTERNAL_IN_LINK} (or, when enabled, {@code INTERNAL_IN_LINK}) entry per
     *       outgoing link, keyed by the link target and carrying the normalized anchor text and
     *       the source docno.</li>
     * </ul>
     *
     * <p>Per-task state is held in instance fields and per-record state in locals, so nothing is
     * written to static fields from instance methods.
     */
    public static class Map extends MapReduceBase implements Mapper<IntWritable, ClueWarcRecord, Text, ArrayListWritable<AnchorText>> {
        /** Reusable output key. */
        private final Text keyWord = new Text();
        /** Reusable output value; cleared before every emit. */
        private final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<>();
        /** Docid-to-docno mapping, loaded in {@link #configure(JobConf)}. */
        private final ClueWarcDocnoMapping docnoMapping = new ClueWarcDocnoMapping();
        private final Parser parser = new Parser();
        /** Selects the link tags of a parsed page. */
        private final NodeFilter filter = new NodeClassFilter(LinkTag.class);
        /** Value of {@code Cloud9.IncludeInternalLinks}; when false, same-host links are skipped. */
        private boolean includeInternalLinks;
        /** Instance of the class named by {@code Cloud9.AnchorTextNormalizer}. */
        private AnchorTextNormalizer normalizer;

        /** Job counters maintained by the mapper. */
        public enum LinkCounter {
            /** Records handed to {@code map}. */
            INPUT_DOCS,
            /** Records for which the docno entry was emitted. */
            OUTPUT_DOCS,
            /** Records whose docno lookup failed. */
            INVALID_DOCNO,
            /** Records without a usable target URI or host. */
            INVALID_URL,
            /** Links whose anchor text had to be dropped because serialization rejected it. */
            TEXT_TOO_LONG,
            /** Records whose HTML could not be processed. */
            PARSER_FAILED
        }

        /**
         * Loads the docno mapping from the first distributed-cache file and instantiates the
         * configured anchor text normalizer.
         *
         * @param jobConf job configuration
         * @throws RuntimeException if the cache files cannot be listed, the mapping cannot be
         *     loaded, or the normalizer cannot be instantiated; the underlying exception is
         *     attached as the cause
         */
        @Override
        public void configure(JobConf jobConf) {
            Path[] localFiles;
            try {
                localFiles = DistributedCache.getLocalCacheFiles(jobConf);
            } catch (IOException e) {
                throw new RuntimeException("Local cache files not read properly.", e);
            }
            if (localFiles == null || localFiles.length == 0) {
                throw new RuntimeException("Local cache files not read properly.");
            }

            try {
                docnoMapping.loadMapping(localFiles[0], FileSystem.getLocal(jobConf));
            } catch (Exception e) {
                throw new RuntimeException("Error initializing DocnoMapping!", e);
            }

            includeInternalLinks = jobConf.getBoolean("Cloud9.IncludeInternalLinks", false);

            // Kept in its own try block so that a normalizer failure is not reported as a
            // docno-mapping failure.
            try {
                normalizer = (AnchorTextNormalizer) Class.forName(jobConf.get("Cloud9.AnchorTextNormalizer")).getDeclaredConstructor().newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing AnchorTextNormalizer", e);
            }
        }

        /**
         * Emits the docno entry for the record and one entry per qualifying outgoing link.
         *
         * @param intWritable input key (unused)
         * @param clueWarcRecord the WARC record to process
         * @param outputCollector receives (URL, anchor list) pairs
         * @param reporter used for the {@link LinkCounter} counters
         * @throws IOException if the collector fails for a reason other than over-long text
         */
        @Override
        public void map(IntWritable intWritable, ClueWarcRecord clueWarcRecord, OutputCollector<Text, ArrayListWritable<AnchorText>> outputCollector, Reporter reporter) throws IOException {
            reporter.incrCounter(LinkCounter.INPUT_DOCS, 1L);

            final int docno;
            try {
                docno = docnoMapping.getDocno(clueWarcRecord.getHeaderMetadataItem("WARC-TREC-ID"));
            } catch (NullPointerException e) {
                // The lookup signals an unusable record by throwing; such records are discarded.
                reporter.incrCounter(LinkCounter.INVALID_DOCNO, 1L);
                return;
            }

            final String base;
            try {
                base = clueWarcRecord.getHeaderMetadataItem("WARC-Target-URI");
            } catch (NullPointerException e) {
                reporter.incrCounter(LinkCounter.INVALID_URL, 1L);
                return;
            }
            if (base == null) {
                reporter.incrCounter(LinkCounter.INVALID_URL, 1L);
                return;
            }

            // Emit the page's own docno under its URL.
            arrayList.clear();
            arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, null, docno));
            keyWord.set(base);
            outputCollector.collect(keyWord, arrayList);
            arrayList.clear();
            reporter.incrCounter(LinkCounter.OUTPUT_DOCS, 1L);

            final String baseHost;
            try {
                baseHost = new URI(base).getHost();
            } catch (URISyntaxException e) {
                reporter.incrCounter(LinkCounter.INVALID_URL, 1L);
                return;
            }
            if (baseHost == null) {
                reporter.incrCounter(LinkCounter.INVALID_URL, 1L);
                return;
            }

            // IOException from the collector is deliberately NOT caught here: it must fail the
            // task instead of being counted as a bad document.
            try {
                NodeList links = extractLinkNodes(clueWarcRecord.getContent(), base);
                for (int i = 0; i < links.size(); i++) {
                    emitLink((LinkTag) links.elementAt(i), base, baseHost, docno, outputCollector, reporter);
                }
            } catch (ParserException e) {
                reporter.incrCounter(LinkCounter.PARSER_FAILED, 1L);
            } catch (RuntimeException e) {
                // Unchecked failures while handling the page still only skip this document.
                reporter.incrCounter(LinkCounter.PARSER_FAILED, 1L);
            } catch (StackOverflowError e) {
                reporter.incrCounter(LinkCounter.PARSER_FAILED, 1L);
            }
        }

        /** Parses the page, appends a base-href tag for {@code base}, and returns its link tags. */
        private NodeList extractLinkNodes(String content, String base) throws ParserException {
            parser.setInputHTML(content);
            NodeList nodes = parser.parse((NodeFilter) null);
            BaseHrefTag baseHrefTag = new BaseHrefTag();
            baseHrefTag.setBaseUrl(base);
            nodes.add(baseHrefTag);
            // Re-parse the page together with the base tag before extracting the links.
            parser.setInputHTML(nodes.toHtml());
            return parser.extractAllNodesThatMatch(filter);
        }

        /** Emits one in-link record for {@code link}, or nothing if the link does not qualify. */
        private void emitLink(LinkTag link, String base, String baseHost, int docno, OutputCollector<Text, ArrayListWritable<AnchorText>> outputCollector, Reporter reporter) throws IOException {
            String anchor = link.getLinkText();
            String url = link.extractLink();
            if (url == null || url.equals(base)) {
                return;
            }

            final String host;
            try {
                host = new URI(url).getHost();
            } catch (URISyntaxException e) {
                // Unparseable link target: skip just this link.
                return;
            }
            if (host == null) {
                return;
            }

            final boolean internal = baseHost.equals(host);
            if (internal && !includeInternalLinks) {
                // Skip instead of emitting an empty value list for the target URL.
                return;
            }

            String normalized = normalizer.process(anchor == null ? "" : anchor);
            arrayList.clear();
            if (internal) {
                arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val, normalized, docno));
            } else {
                arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, normalized, docno));
            }

            try {
                keyWord.set(url);
                outputCollector.collect(keyWord, arrayList);
            } catch (UTFDataFormatException e) {
                // Serialization rejected the record: retry with the anchor text dropped.
                reporter.incrCounter(LinkCounter.TEXT_TOO_LONG, 1L);
                keyWord.set(url);
                byte type = arrayList.get(0).getType();
                arrayList.clear();
                arrayList.add(new AnchorText(type, "", docno));
                outputCollector.collect(keyWord, arrayList);
            }
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/webgraph/ClueExtractLinks$Reduce.class */
    public static class Reduce extends MapReduceBase implements Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {
        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<>();
        private static ArrayListWritable<AnchorText> packet;
        private static boolean pushed;

        /* JADX WARN: Multi-variable type inference failed */
        public void reduce(Text text, Iterator<ArrayListWritable<AnchorText>> it, OutputCollector<Text, ArrayListWritable<AnchorText>> outputCollector, Reporter reporter) throws IOException {
            arrayList.clear();
            while (it.hasNext()) {
                packet = it.next();
                Iterator<E> it2 = packet.iterator();
                while (it2.hasNext()) {
                    AnchorText anchorText = (AnchorText) it2.next();
                    pushed = false;
                    int i = 0;
                    while (true) {
                        if (i >= arrayList.size()) {
                            break;
                        }
                        if (((AnchorText) arrayList.get(i)).equalsIgnoreSources(anchorText)) {
                            ((AnchorText) arrayList.get(i)).addDocumentsFrom(anchorText);
                            pushed = true;
                            break;
                        }
                        i++;
                    }
                    if (!pushed) {
                        arrayList.add(anchorText.m337clone());
                    }
                }
            }
            outputCollector.collect(text, arrayList);
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((Text) obj, (Iterator<ArrayListWritable<AnchorText>>) it, (OutputCollector<Text, ArrayListWritable<AnchorText>>) outputCollector, reporter);
        }
    }

    /**
     * Returns the configuration keys listed in {@link #RequiredParameters}.
     *
     * @return the {@code RequiredParameters} array itself (not a copy)
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Creates the tool, passing the configuration on to the {@code PowerTool} constructor.
     *
     * @param configuration configuration from which {@link #runTool()} later reads its settings
     */
    public ClueExtractLinks(Configuration configuration) {
        super(configuration);
    }

    /**
     * Builds the link-extraction job from the tool's configuration and runs it.
     *
     * <p>The docno mapping file is registered in the distributed cache, {@link Map} is used as
     * the mapper and {@link Reduce} as both combiner and reducer, and the output is written as a
     * block-compressed sequence file. If the output path already exists the job is not submitted.
     *
     * @return always 0
     * @throws RuntimeException if the docno mapping file does not exist
     * @throws Exception if job setup or execution fails
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public int runTool() throws Exception {
        final JobConf conf = new JobConf(getConf(), ClueExtractLinks.class);
        final FileSystem fs = FileSystem.get(conf);

        final int numMappers = conf.getInt("Cloud9.Mappers", 1);
        final int numReducers = conf.getInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
        final String inputPath = conf.get("Cloud9.InputPath");
        final String outputPath = conf.get("Cloud9.OutputPath");
        final String mappingFile = conf.get("Cloud9.DocnoMappingFile");

        final boolean mappingPresent = fs.exists(new Path(mappingFile));
        if (!mappingPresent) {
            throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new URI(mappingFile), conf);

        // Job identity, memory and timeout settings.
        conf.setJobName("ClueExtractLinks");
        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);
        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
        conf.set("mapreduce.task.timeout", "60000000");

        // Task counts and processing classes.
        conf.setNumMapTasks(numMappers);
        conf.setNumReduceTasks(numReducers);
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(ArrayListWritable.class);

        // Input and output formats and locations.
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
        SequenceFileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        LOG.info("ClueExtractLinks");
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath);
        LOG.info(" - mapping file: " + mappingFile);
        LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

        if (fs.exists(new Path(outputPath))) {
            LOG.info(outputPath + " already exists! Skipping this step...");
        } else {
            JobClient.runJob(conf);
        }
        return 0;
    }
}
