package org.apache.uima.ruta.engine;

import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.ruta.visitor.CreatedByVisitor;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.visitors.TextExtractingVisitor;

/* loaded from: input_file:org/apache/uima/ruta/engine/HtmlConverterVisitor.class */
public class HtmlConverterVisitor extends TextExtractingVisitor {
    private boolean skipWhitespace;
    private Collection<String> newlineInducingTags;
    private boolean processAll;
    private List<String> gapInducingTags;
    private String gapText;
    private Pattern newlineInducingTagPattern;
    private boolean inBody = false;
    private boolean inScript = false;
    private SortedSet<HtmlConverterPSpan> textSpans = new TreeSet();
    private SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet();
    private SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet();

    public HtmlConverterVisitor(String[] strArr, String str, String[] strArr2, String str2, boolean z, boolean z2) {
        this.skipWhitespace = true;
        this.processAll = true;
        if (strArr != null) {
            this.newlineInducingTags = Arrays.asList(strArr);
        }
        if (strArr2 != null) {
            this.gapInducingTags = Arrays.asList(strArr2);
        }
        this.gapText = str2;
        this.skipWhitespace = z;
        this.processAll = z2;
        if (str != null) {
            this.newlineInducingTagPattern = Pattern.compile(str);
        }
    }

    public void visitStringNode(Text text) {
        super.visitStringNode(text);
        if ((this.processAll || this.inBody) && !this.inScript) {
            if (this.skipWhitespace && StringUtils.isBlank(text.getText())) {
                return;
            }
            this.textSpans.add(new HtmlConverterPSpan(text.getStartPosition(), text.getEndPosition(), text.getText()));
        }
    }

    public void visitTag(Tag tag) {
        super.visitTag(tag);
        String trim = tag.getTagName().toLowerCase().trim();
        if (trim.equals("body")) {
            this.inBody = true;
        } else if (trim.equals(CreatedByVisitor.FEATURE_SCRIPT)) {
            this.inScript = true;
        }
        boolean z = false;
        if (this.newlineInducingTagPattern != null && this.newlineInducingTagPattern.matcher(trim).matches()) {
            z = true;
        }
        if (z || (this.newlineInducingTags != null && this.newlineInducingTags.contains(trim))) {
            int startPosition = tag.getStartPosition();
            this.linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(startPosition, startPosition + 1, HtmlConverter.LINEBREAK));
        }
        if (this.gapInducingTags == null || !this.gapInducingTags.contains(trim)) {
            return;
        }
        int startPosition2 = tag.getStartPosition();
        this.gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(startPosition2, startPosition2 + this.gapText.length(), this.gapText));
    }

    public void visitEndTag(Tag tag) {
        String trim = tag.getTagName().toLowerCase().trim();
        if (trim.equals("body")) {
            this.inBody = false;
        } else if (trim.equals(CreatedByVisitor.FEATURE_SCRIPT) || (tag instanceof ScriptTag)) {
            this.inScript = false;
        }
    }

    public SortedSet<HtmlConverterPSpan> getTextSpans() {
        return this.textSpans;
    }

    public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
        return this.linebreaksFromHtmlTags;
    }

    public SortedSet<HtmlConverterPSpan> getGapsFromHtmlTags() {
        return this.gapsFromHtmlTags;
    }
}
