package edu.jhu.hlt.concrete.ingesters.conll;

import edu.jhu.hlt.concrete.AnnotationMetadata;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Entity;
import edu.jhu.hlt.concrete.EntityMention;
import edu.jhu.hlt.concrete.EntityMentionSet;
import edu.jhu.hlt.concrete.EntitySet;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.Sentence;
import edu.jhu.hlt.concrete.SituationMention;
import edu.jhu.hlt.concrete.SituationMentionSet;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.Token;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.TokenizationKind;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.stream.StreamBasedStreamIngester;
import edu.jhu.hlt.concrete.ingesters.conll.Conll2011Sentence;
import edu.jhu.hlt.concrete.serialization.TarGzCompactCommunicationSerializer;
import edu.jhu.hlt.concrete.util.Timing;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/jhu/hlt/concrete/ingesters/conll/Conll2011.class */
public class Conll2011 implements StreamBasedStreamIngester {
    /** Section type label. Not referenced inside this class; presumably used by sibling classes in this package (confirm). */
    static final String SECTION_TYPE = "Passage";
    /** Directory whose entries are listed by {@code preIngest()}. */
    private final Path ingestPath;
    /** Filter applied to each directory entry; only accepted paths are read. */
    private final Predicate<Path> keep;
    private static final Logger LOGGER = LoggerFactory.getLogger(Conll2011.class);
    /** Captured once when the class is initialized and shared by every {@code META_*} constant below. */
    private static final long timestamp = Timing.currentLocalTime();
    /** Matches a document header line: group 1 is the text inside the parentheses, group 2 the token after "part". */
    private static final Pattern p = Pattern.compile("^#begin document \\((\\S+)\\); part (\\S+)$");
    /** Passed as the third constructor argument of every {@code META_*} constant below (presumably the k-best rank; confirm). */
    private static final int kbest = 1;
    // One metadata object per annotation layer; they differ only in the tool-name string.
    static final AnnotationMetadata META_GENERAL = new AnnotationMetadata("conll-2011", timestamp, kbest);
    static final AnnotationMetadata META_COREF = new AnnotationMetadata("conll-2011 coref", timestamp, kbest);
    static final AnnotationMetadata META_PARSE = new AnnotationMetadata("conll-2011 parse", timestamp, kbest);
    static final AnnotationMetadata META_NER = new AnnotationMetadata("conll-2011 NER", timestamp, kbest);
    static final AnnotationMetadata META_POS = new AnnotationMetadata("conll-2011 POS", timestamp, kbest);
    static final AnnotationMetadata META_SRL = new AnnotationMetadata("conll-2011 SRL", timestamp, kbest);
    // Public configuration flags. Apart from includeDebugInfo (read in readSentence), none of these
    // is read in this class; presumably they are consulted by Conll2011Document / Conll2011Sentence,
    // which receive this instance in their constructors (confirm).
    public boolean addNerAsTokenTagging = true;
    public boolean addNerAsEntityMentionSet = true;
    public boolean includeSingleTokenConstituents = true;
    /** When true, readSentence attaches a Conll2011Sentence.DebugInfo (source path and line range) to sentences. */
    public boolean includeDebugInfo = false;
    public boolean debug = false;
    public boolean warnOnEmptyCoref = true;

    /**
     * Creates an ingester over a directory of CoNLL files.
     *
     * @param path directory whose entries are listed during ingest
     * @param predicate filter deciding which directory entries are read
     */
    public Conll2011(Path path, Predicate<Path> predicate) {
        this.keep = predicate;
        this.ingestPath = path;
    }

    /**
     * Counts the occurrences of a character in a string.
     *
     * @param c the character to look for
     * @param str the string to scan; must not be null
     * @return how many positions of {@code str} hold {@code c}
     */
    public static int count(char c, String str) {
        int occurrences = 0;
        // Hop from one match to the next instead of inspecting every position by hand.
        for (int pos = str.indexOf(c); pos >= 0; pos = str.indexOf(c, pos + 1)) {
            occurrences++;
        }
        return occurrences;
    }

    /**
     * Appends the single section of {@code communication2}, together with its situation mentions,
     * entities and entity mentions, to {@code communication}.
     *
     * <p>All preconditions are checked before anything is modified, so a rejected merge leaves
     * {@code communication} untouched.
     *
     * @param communication the target that receives the merged content; must hold exactly one
     *     SituationMentionSet and one EntitySet, and as many EntityMentionSets as the source
     * @param communication2 the source; must hold exactly one Section, one SituationMentionSet and
     *     one EntitySet
     * @throws IllegalArgumentException if any of the cardinality requirements above is violated
     */
    public static void mergeInto(Communication communication, Communication communication2) {
        // Validate everything up front so that a failure cannot leave the target half-merged.
        if (communication2.getSectionListSize() != 1) {
            throw new IllegalArgumentException(
                    "source must contain exactly one Section, found " + communication2.getSectionListSize());
        }
        if (communication.getSituationMentionSetList().size() != 1) {
            throw new IllegalArgumentException("target must contain exactly one SituationMentionSet, found "
                    + communication.getSituationMentionSetList().size());
        }
        if (communication2.getSituationMentionSetList().size() != 1) {
            throw new IllegalArgumentException("source must contain exactly one SituationMentionSet, found "
                    + communication2.getSituationMentionSetList().size());
        }
        if (communication.getEntitySetListSize() != 1) {
            throw new IllegalArgumentException(
                    "target must contain exactly one EntitySet, found " + communication.getEntitySetListSize());
        }
        if (communication2.getEntitySetListSize() != 1) {
            throw new IllegalArgumentException(
                    "source must contain exactly one EntitySet, found " + communication2.getEntitySetListSize());
        }
        if (communication.getEntityMentionSetListSize() != communication2.getEntityMentionSetListSize()) {
            throw new IllegalArgumentException("EntityMentionSet counts differ: target has "
                    + communication.getEntityMentionSetListSize() + ", source has "
                    + communication2.getEntityMentionSetListSize());
        }

        communication.addToSectionList(communication2.getSectionList().get(0));

        SituationMentionSet targetSituations = communication.getSituationMentionSetList().get(0);
        for (SituationMention mention : communication2.getSituationMentionSetList().get(0).getMentionList()) {
            targetSituations.addToMentionList(mention);
        }

        EntitySet targetEntities = communication.getEntitySetList().get(0);
        for (Entity entity : communication2.getEntitySetList().get(0).getEntityList()) {
            targetEntities.addToEntityList(entity);
        }

        // EntityMentionSets are paired up by position in the two lists.
        for (int i = 0; i < communication.getEntityMentionSetListSize(); i++) {
            EntityMentionSet targetMentions = communication.getEntityMentionSetList().get(i);
            for (EntityMention mention : communication2.getEntityMentionSetList().get(i).getMentionList()) {
                targetMentions.addToMentionList(mention);
            }
        }
    }

    /**
     * Folds a list of single-section communications into the first element of the list.
     *
     * <p>The first communication is modified in place and returned; every later element is merged
     * into it via {@code mergeInto}.
     *
     * @param list communications belonging to the same document; must contain at least one element
     * @return the first element of {@code list}, now holding the content of all the others
     * @throws IllegalArgumentException if a later element's id differs from the first element's id
     * @throws IndexOutOfBoundsException if {@code list} is empty
     */
    public static Communication mergeCommunicationsAsSections(List<Communication> list) {
        final Communication merged = list.get(0);
        for (Communication other : list.subList(1, list.size())) {
            boolean sameDocument = merged.getId().equals(other.getId());
            if (!sameDocument) {
                throw new IllegalArgumentException("not all ids match, these should be sections from the same document and have the same id");
            }
            mergeInto(merged, other);
        }
        return merged;
    }

    /**
     * Lists the ingest directory, keeps the entries accepted by the filter, and exposes the
     * documents of each kept file as its own stream.
     *
     * <p>The directory listing is materialized and the directory stream closed before returning,
     * so no directory handle is left open. The files themselves are still read lazily, one per
     * element of the outer stream as it is consumed.
     *
     * @return one inner stream of documents per accepted file
     * @throws IOException if the directory cannot be listed
     */
    public Stream<Stream<Conll2011Document>> preIngest() throws IOException {
        final List<Path> selected;
        // Files.list holds an open directory handle until the stream is closed.
        try (Stream<Path> entries = Files.list(this.ingestPath)) {
            selected = entries.filter(this.keep).collect(Collectors.toList());
        }
        return selected.stream().map(this::readDocuments).map(List::stream);
    }

    /**
     * Reads every document contained in one CoNLL file.
     *
     * @param path the file to read, decoded as UTF-8
     * @return the documents found in the file, in file order
     * @throws UncheckedIOException if the file cannot be read
     * @throws RuntimeException if a document is not terminated before the end of the file
     */
    private List<Conll2011Document> readDocuments(Path path) {
        LOGGER.debug("reading from {}", path);
        final List<String> lines;
        try {
            // readAllLines opens and closes the file itself, so no stream is left open.
            lines = Files.readAllLines(path, StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        final List<Conll2011Document> documents = new ArrayList<>();
        int next = 0;
        while (next < lines.size()) {
            next = readDocument(path, lines, next, documents);
            // readDocument reports a missing end marker with a negative index.
            if (next < 0) {
                throw new RuntimeException("unterminated document (no \"#end document\" line) in " + path);
            }
        }
        return documents;
    }

    /**
     * Parses one document starting at its "#begin document" header line.
     *
     * <p>The new document is appended to {@code list2} as soon as its header is parsed. Its
     * sentences are attached only once the "#end document" line is reached.
     *
     * @param path the file the lines came from; forwarded to {@code readSentence} and used in errors
     * @param list all lines of the file
     * @param i index of the header line within {@code list}
     * @param list2 documents read so far from this file; the new document is appended to it
     * @return the index of the line following "#end document", or -1 if the file ends first
     * @throws RuntimeException if line {@code i} is not a valid document header
     */
    private int readDocument(Path path, List<String> list, int i, List<Conll2011Document> list2) {
        final String header = list.get(i);
        final Matcher matcher = p.matcher(header);
        if (!matcher.matches()) {
            // Log the surrounding lines for context, guarding against the file boundaries so the
            // diagnostics cannot themselves fail with an IndexOutOfBoundsException.
            if (i > 0) {
                LOGGER.warn("prev={}", list.get(i - 1));
            }
            LOGGER.warn("head={}", header);
            if (i + 1 < list.size()) {
                LOGGER.warn("next={}", list.get(i + 1));
            }
            throw new RuntimeException("expected a \"#begin document\" header at line " + (i + 1)
                    + " of " + path + " but found: " + header);
        }
        final String documentId = matcher.group(1);

        // Sentence indices continue across the documents already read from this file.
        int sentenceIndex = 0;
        for (Conll2011Document earlier : list2) {
            sentenceIndex += earlier.getSentences().size();
        }

        // NOTE(review): the header's "part" (group 2) is captured by the pattern but "???" is passed instead.
        final Conll2011Document document = new Conll2011Document(this, documentId, "???");
        list2.add(document);

        final List<Conll2011Sentence> sentences = new ArrayList<>();
        int cursor = i + 1;
        // readSentence returns -1 when the file ends inside a sentence; stop instead of indexing with it.
        while (cursor >= 0 && cursor < list.size()) {
            if (list.get(cursor).startsWith("#end document")) {
                for (Conll2011Sentence sentence : sentences) {
                    document.add(sentence);
                }
                return cursor + 1;
            }
            cursor = readSentence(path, list, cursor, sentenceIndex, sentences);
            sentenceIndex++;
        }
        return -1;
    }

    /**
     * Reads the rows of one sentence, which ends at the first empty line.
     *
     * <p>The sentence is appended to {@code list2} before any row is read. When
     * {@code includeDebugInfo} is set, the source path and line range are attached to the sentence
     * however it ends, not only when the file is exhausted.
     *
     * @param path the file the lines came from; recorded in the debug info
     * @param list all lines of the file
     * @param i index of the sentence's first row within {@code list}
     * @param i2 index passed to the new sentence's constructor
     * @param list2 sentences of the current document; the new sentence is appended to it
     * @return the index of the line after the terminating empty line, or -1 if the file ends first
     */
    private int readSentence(Path path, List<String> list, int i, int i2, List<Conll2011Sentence> list2) {
        final Conll2011Sentence sentence = new Conll2011Sentence(this, i2);
        list2.add(sentence);
        int next = -1;
        for (int line = i; line < list.size(); line++) {
            final String row = list.get(line);
            if (row.isEmpty()) {
                next = line + 1;
                break;
            }
            sentence.add(new Conll2011Row(row));
        }
        if (this.includeDebugInfo) {
            sentence.debugInfo = new Conll2011Sentence.DebugInfo(path, i, i + sentence.size());
        }
        return next;
    }

    /**
     * Command-line entry point: converts a directory of CoNLL files into one Concrete tar.gz.
     *
     * <p>Expects three arguments: the input directory, the output archive path (which must not
     * exist yet), and the file-name suffix of the CoNLL files to ingest. With any other argument
     * count, usage is printed to standard error and the method returns.
     *
     * @param strArr the command-line arguments
     * @throws IllegalArgumentException if the output path already exists
     * @throws Exception if ingesting or writing fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 3) {
            System.err.println("please provide:");
            System.err.println("1) an input directory of CoNLL data");
            System.err.println("2) an output Concrete tar gz file");
            System.err.println("3) suffix for the CoNLL files you're looking for (e.g. \".v4_gold_conll\")");
            return;
        }
        final Path inputDir = Paths.get(strArr[0]);
        final Path output = Paths.get(strArr[1]);
        final String suffix = strArr[2];
        if (Files.exists(output)) {
            throw new IllegalArgumentException("output must not exist (this tool won't overwrite): " + output);
        }
        System.out.println("reading from " + inputDir + " looking for files that end in \"" + suffix + "\"");
        // Path.endsWith(String) compares whole path elements, so a suffix such as ".v4_gold_conll"
        // would never match "doc.v4_gold_conll"; compare on the textual form instead.
        final Conll2011 ingester = new Conll2011(inputDir, candidate -> candidate.toString().endsWith(suffix));
        final List<Communication> communications = ingester.stream().collect(Collectors.toList());
        System.out.println("writing " + communications.size() + " Communications to " + output);
        new TarGzCompactCommunicationSerializer().toTarGz(communications, output);
        System.out.println("done");
    }

    /**
     * @return the fixed kind label {@code "document"}
     */
    public String getKind() {
        return "document";
    }

    /**
     * Returns {@code Timing.currentLocalTime()} evaluated at the moment of the call. Unlike the
     * static {@code timestamp} captured at class initialization for the {@code META_*} constants,
     * this is re-evaluated on every call.
     *
     * @return the value of {@code Timing.currentLocalTime()} for this call
     */
    public long getTimestamp() {
        return Timing.currentLocalTime();
    }

    /**
     * @return the simple name of this class, used as the tool name
     */
    public String getTool() {
        return Conll2011.class.getSimpleName();
    }

    /**
     * @return the hard-coded version string {@code "4.8.6"}
     */
    public String getToolVersion() {
        return "4.8.6";
    }

    /**
     * @return a new, empty, mutable list; this ingester records no tool notes
     */
    public List<String> getToolNotes() {
        return new ArrayList<>();
    }

    /**
     * Produces one Communication per accepted input file.
     *
     * <p>For each file, every document is converted to a Communication, the results are merged
     * into a single multi-section Communication, and finally the text and text spans are derived
     * from the tokens.
     *
     * @return a lazily evaluated stream of Communications, one per file
     * @throws IngestException if the ingest directory cannot be listed
     */
    public Stream<Communication> stream() throws IngestException {
        final Stream<Stream<Conll2011Document>> perFile;
        try {
            perFile = preIngest();
        } catch (IOException e) {
            throw new IngestException(e);
        }
        return perFile
                .map(documents -> documents.map(document -> document.convertToConcrete()).collect(Collectors.toList()))
                .map(Conll2011::mergeCommunicationsAsSections)
                .map(Conll2011::projectTokenTextSpansUpwards);
    }

    /**
     * Builds the communication text from its tokens and derives token, sentence and section text
     * spans from the positions in that text.
     *
     * <p>Tokens within a sentence are joined by single spaces. A newline is appended after every
     * sentence and one more after every section. A sentence span covers its tokens only; a
     * section span runs from its first token up to and including the newline after its last
     * sentence. The input is not modified: the work is done on a copy created with the
     * Communication copy constructor.
     *
     * @param communication a communication without text whose tokenizations are token lists with
     *     text set on every token
     * @return a copy of {@code communication} with text and text spans set
     * @throws IllegalArgumentException if text is already set, a tokenization is not a token list,
     *     or a token has no text
     * @throws RuntimeException if a sentence or section already has a text span that disagrees with
     *     the computed one
     */
    public static Communication projectTokenTextSpansUpwards(Communication communication) {
        if (communication.isSetText()) {
            throw new IllegalArgumentException("text is already set");
        }
        final Communication copy = new Communication(communication);
        final StringBuilder text = new StringBuilder();
        for (Section section : copy.getSectionList()) {
            final int sectionStart = text.length();
            for (Sentence sentence : section.getSentenceList()) {
                final Tokenization tokenization = sentence.getTokenization();
                if (!TokenizationKind.TOKEN_LIST.equals(tokenization.getKind())) {
                    throw new IllegalArgumentException("only token lists are supported");
                }
                final int sentenceStart = text.length();
                final List<Token> tokens = tokenization.getTokenList().getTokenList();
                for (int i = 0; i < tokens.size(); i++) {
                    if (i > 0) {
                        text.append(' ');
                    }
                    final Token token = tokens.get(i);
                    if (!token.isSetText()) {
                        throw new IllegalArgumentException("Token text is not set!");
                    }
                    final int tokenStart = text.length();
                    text.append(token.getText());
                    token.setTextSpan(new TextSpan(tokenStart, text.length()));
                }
                final int sentenceEnd = text.length();
                if (sentence.isSetTextSpan()) {
                    requireMatchingTextSpan("Sentence", sentence.getTextSpan(), sentenceStart, sentenceEnd);
                } else {
                    sentence.setTextSpan(new TextSpan(sentenceStart, sentenceEnd));
                }
                text.append('\n');
            }
            // Measured before the section separator is appended, so it includes the last sentence's newline.
            final int sectionEnd = text.length();
            if (section.isSetTextSpan()) {
                requireMatchingTextSpan("Section", section.getTextSpan(), sectionStart, sectionEnd);
            } else {
                section.setTextSpan(new TextSpan(sectionStart, sectionEnd));
            }
            text.append('\n');
        }
        copy.setText(text.toString());
        return copy;
    }

    /** Throws if a pre-existing text span does not equal the span computed from the tokens. */
    private static void requireMatchingTextSpan(String owner, TextSpan existing, int computedStart, int computedEnd) {
        if (existing.getStart() != computedStart || existing.getEnding() != computedEnd) {
            throw new RuntimeException("incompatible existing " + owner + ".textSpan! existingStart=" + existing.getStart()
                    + " existingEnd=" + existing.getEnding()
                    + " computedStart=" + computedStart
                    + " computedEnd=" + computedEnd);
        }
    }
}
