package org.apache.lucene.benchmark.byTask.feeds;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

/* loaded from: input_file:org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.class */
/**
 * Content source that walks a directory of input files and splits them into documents
 * delimited by {@code <DOC>} ... {@code </DOC>} lines, each carrying a {@code <DOCNO>} line.
 *
 * <p>Reading from the shared {@link #reader} is serialized on an internal lock; the raw
 * document text is collected into a per-thread buffer and handed to the configured
 * {@link TrecDocParser} outside of that lock.
 *
 * <p>Configuration keys read by {@link #setConfig(Config)}:
 * <ul>
 *   <li>{@code work.dir} (default {@code work}) - base for a relative {@code docs.dir}</li>
 *   <li>{@code docs.dir} (default {@code trec}) - directory whose files are read</li>
 *   <li>{@code trec.doc.parser} - class name of the {@link TrecDocParser} to instantiate</li>
 *   <li>{@code html.parser} - class name of the {@link HTMLParser} to instantiate</li>
 *   <li>{@code content.source.excludeIteration} (default {@code false}) - when true, the
 *       iteration counter is not appended to document names</li>
 * </ul>
 */
public class TrecContentSource extends ContentSource {
    /** Prefix of the line that carries the document name. */
    public static final String DOCNO = "<DOCNO>";
    /** Closing tag that ends the document name on the {@link #DOCNO} line. */
    public static final String TERMINATING_DOCNO = "</DOCNO>";
    /** Prefix of the line that starts a document. */
    public static final String DOC = "<DOC>";
    /** Prefix of the line that ends a document. */
    public static final String TERMINATING_DOC = "</DOC>";
    /** Separator inserted between collected lines (the platform line separator). */
    public static final String NEW_LINE = System.getProperty("line.separator");

    /** Patterns tried, in this order, by {@link #parseDate(String)}. */
    private static final String[] DATE_FORMATS = {
        "EEE, dd MMM yyyy kk:mm:ss z",
        "EEE MMM dd kk:mm:ss yyyy z",
        "EEE, dd-MMM-':'y kk:mm:ss z",
        "EEE, dd-MMM-yyy kk:mm:ss z",
        "EEE MMM dd kk:mm:ss yyyy",
        "dd MMM yyyy",
        "MMM dd, yyyy",
        "yyMMdd",
        "hhmm z.z.z. MMM dd, yyyy"
    };

    /** Reader over the currently open input file, or {@code null} when none is open. */
    BufferedReader reader;
    /** HTML parser instantiated by {@link #setConfig(Config)}. */
    HTMLParser htmlParser;
    /** When true, document names are returned without the {@code _<iteration>} suffix. */
    private boolean excludeDocnameIteration;
    /** Path type of the currently open file, as reported by {@link TrecDocParser#pathType}. */
    TrecDocParser.ParsePathType currPathType;
    /** Per-thread date formats; {@link SimpleDateFormat} instances must not be shared across threads. */
    private final ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<>();
    /** Per-thread buffer that holds the raw text of the document being read. */
    private final ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<>();
    private File dataDir = null;
    /** Files collected from the data directory; kept as ArrayList because it is passed to collectFiles. */
    private final ArrayList<File> inputFiles = new ArrayList<>();
    /** Index into {@link #inputFiles} of the next file to open. */
    private int nextFile = 0;
    /** Monitor guarding {@link #reader}, {@link #nextFile} and {@link #iteration}. */
    private final Object lock = new Object();
    /** Number of times the file list has been exhausted and restarted. */
    int iteration = 0;
    private TrecDocParser trecDocParser = new TrecGov2Parser();

    /** Holder for one thread's date formats plus a reusable {@link ParsePosition}. */
    public static final class DateFormatInfo {
        DateFormat[] dfs;
        ParsePosition pos;

        DateFormatInfo() {
        }
    }

    /**
     * Returns the calling thread's {@link DateFormatInfo}, creating it on first use with one
     * lenient, {@link Locale#ROOT} {@link SimpleDateFormat} per entry of {@link #DATE_FORMATS}.
     */
    private DateFormatInfo getDateFormatInfo() {
        DateFormatInfo info = this.dateFormats.get();
        if (info != null) {
            return info;
        }
        info = new DateFormatInfo();
        info.dfs = new DateFormat[DATE_FORMATS.length];
        for (int i = 0; i < info.dfs.length; i++) {
            DateFormat format = new SimpleDateFormat(DATE_FORMATS[i], Locale.ROOT);
            format.setLenient(true);
            info.dfs[i] = format;
        }
        info.pos = new ParsePosition(0);
        this.dateFormats.set(info);
        return info;
    }

    /** Returns the calling thread's document buffer, creating it on first use. */
    private StringBuilder getDocBuffer() {
        StringBuilder buffer = this.trecDocBuffer.get();
        if (buffer == null) {
            buffer = new StringBuilder();
            this.trecDocBuffer.set(buffer);
        }
        return buffer;
    }

    /** Returns the HTML parser instantiated by {@link #setConfig(Config)}. */
    public HTMLParser getHtmlParser() {
        return this.htmlParser;
    }

    /**
     * Reads lines from {@link #reader} until one starts with {@code lineStart}.
     *
     * <p>When the current file is exhausted the next file is opened and reading continues;
     * content already collected into {@code buf} is kept.
     *
     * @param buf              destination for collected lines
     * @param lineStart        prefix that terminates the read; when {@code null} no line ever
     *                         matches and reading only ends via an exception
     * @param collectMatchLine whether the matching line itself is appended to {@code buf}
     * @param collectAll       whether every non-matching line is appended to {@code buf},
     *                         separated by {@link #NEW_LINE}
     * @throws NoMoreDataException if the input files are exhausted before a match is found
     * @throws IOException         if reading fails
     */
    private void read(StringBuilder buf, String lineStart, boolean collectMatchLine, boolean collectAll)
            throws IOException, NoMoreDataException {
        String separator = "";
        while (true) {
            String line = this.reader.readLine();
            if (line == null) {
                // End of the current file: move on (throws NoMoreDataException when nothing is left).
                openNextFile();
                continue;
            }
            if (lineStart != null && line.startsWith(lineStart)) {
                if (collectMatchLine) {
                    buf.append(separator).append(line);
                }
                return;
            }
            if (collectAll) {
                buf.append(separator).append(line);
                separator = NEW_LINE;
            }
        }
    }

    /**
     * Closes the current reader and opens the next input file, wrapping around to the first
     * file (and incrementing {@link #iteration}) when {@code forever} is set.
     *
     * <p>A file that cannot be opened is skipped with a message when {@code verbose} is set;
     * otherwise the failure ends the source with a {@link NoMoreDataException}. In both cases
     * anything that was opened for the failing file is closed first.
     *
     * @throws NoMoreDataException if all files were consumed and {@code forever} is not set, or
     *                             if a file could not be opened and {@code verbose} is not set
     * @throws IOException         if closing a reader fails in a way {@link #close()} propagates
     */
    void openNextFile() throws NoMoreDataException, IOException {
        close();
        this.currPathType = null;
        while (true) {
            if (this.nextFile >= this.inputFiles.size()) {
                if (!this.forever) {
                    throw new NoMoreDataException();
                }
                this.nextFile = 0;
                this.iteration++;
            }
            File file = this.inputFiles.get(this.nextFile++);
            if (this.verbose) {
                System.out.println("opening: " + file + " length: " + file.length());
            }
            InputStream in = null;
            try {
                in = StreamUtils.inputStream(file);
                this.reader = new BufferedReader(new InputStreamReader(in, this.encoding), StreamUtils.BUFFER_SIZE);
                this.currPathType = TrecDocParser.pathType(file);
                return;
            } catch (Exception e) {
                // Do not leak the stream/reader of a file we are about to give up on.
                discardFailedOpen(in);
                if (!this.verbose) {
                    throw new NoMoreDataException();
                }
                System.out.println("Skipping 'bad' file " + file.getAbsolutePath() + " due to " + e.getMessage());
            }
        }
    }

    /**
     * Releases whatever was opened for a file whose setup failed: the reader if it was already
     * created (which also closes the wrapped stream), otherwise the raw stream.
     */
    private void discardFailedOpen(InputStream in) throws IOException {
        if (this.reader != null) {
            close();
            return;
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Best effort: the file is being skipped anyway.
            }
        }
    }

    /**
     * Tries each pattern of {@link #DATE_FORMATS} in order and returns the first successful parse.
     *
     * @param str the date text; surrounding whitespace is ignored
     * @return the parsed date, or {@code null} if {@code str} is {@code null} or matches no
     *         pattern (this method itself never substitutes the current time, despite the
     *         wording of the verbose message)
     */
    public Date parseDate(String str) {
        if (str == null) {
            return null;
        }
        String text = str.trim();
        DateFormatInfo info = getDateFormatInfo();
        for (DateFormat format : info.dfs) {
            // The ParsePosition is reused across attempts, so reset it every time.
            info.pos.setIndex(0);
            info.pos.setErrorIndex(-1);
            Date parsed = format.parse(text, info.pos);
            if (parsed != null) {
                return parsed;
            }
        }
        if (this.verbose) {
            System.out.println("failed to parse date (assigning 'now') for: " + text);
        }
        return null;
    }

    /**
     * Closes the current reader, if any. A failure to close is reported only when
     * {@code verbose} is set and is otherwise ignored; the reader reference is always cleared.
     */
    @Override // org.apache.lucene.benchmark.byTask.feeds.ContentItemsSource, java.io.Closeable, java.lang.AutoCloseable
    public void close() throws IOException {
        if (this.reader == null) {
            return;
        }
        try {
            this.reader.close();
        } catch (IOException e) {
            if (this.verbose) {
                System.out.println("failed to close reader !");
                e.printStackTrace(System.out);
            }
        } finally {
            this.reader = null;
        }
    }

    /**
     * Reads the next document from the input and parses it with the configured
     * {@link TrecDocParser}.
     *
     * <p>Only the raw reading happens under the lock; parsing runs on the caller's thread
     * against its own buffer.
     *
     * @param docData the instance handed to the parser to populate
     * @return whatever the parser returns for this document
     * @throws NoMoreDataException if no further document is available
     * @throws IOException         if reading fails
     */
    @Override // org.apache.lucene.benchmark.byTask.feeds.ContentSource
    public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
        StringBuilder docBuffer = getDocBuffer();
        TrecDocParser.ParsePathType pathType;
        String name;
        synchronized (this.lock) {
            if (this.reader == null) {
                openNextFile();
            }

            // 1. Skip forward to the line that opens the next document.
            docBuffer.setLength(0);
            read(docBuffer, DOC, false, false);
            // Capture the path type of the file in which the document starts.
            pathType = this.currPathType;

            // 2. Document name: the text between <DOCNO> and </DOCNO> on the DOCNO line.
            //    A DOCNO line without the closing tag makes substring throw
            //    StringIndexOutOfBoundsException.
            docBuffer.setLength(0);
            read(docBuffer, DOCNO, true, false);
            int nameStart = DOCNO.length();
            int nameEnd = docBuffer.indexOf(TERMINATING_DOCNO, nameStart);
            name = docBuffer.substring(nameStart, nameEnd).trim();
            if (!this.excludeDocnameIteration) {
                name = name + "_" + this.iteration;
            }

            // 3. Everything up to (not including) the line that closes the document.
            docBuffer.setLength(0);
            read(docBuffer, TERMINATING_DOC, false, true);
        }
        addBytes(docBuffer.length());
        DocData parsed = this.trecDocParser.parse(docData, name, this, docBuffer, pathType);
        addItem();
        return parsed;
    }

    /** Resets the source so that reading restarts at the first file with iteration 0. */
    @Override // org.apache.lucene.benchmark.byTask.feeds.ContentItemsSource
    public void resetInputs() throws IOException {
        synchronized (this.lock) {
            super.resetInputs();
            close();
            this.nextFile = 0;
            this.iteration = 0;
        }
    }

    /**
     * Applies the configuration: resolves the data directory, collects its files, instantiates
     * the document and HTML parsers, and defaults the encoding to ISO-8859-1 when none is set.
     *
     * @param config the benchmark configuration
     * @throws IllegalArgumentException if no files were collected from the data directory
     * @throws RuntimeException         wrapping any failure to load or instantiate a parser class
     */
    @Override // org.apache.lucene.benchmark.byTask.feeds.ContentItemsSource
    public void setConfig(Config config) {
        super.setConfig(config);

        File workDir = new File(config.get("work.dir", "work"));
        String docsDir = config.get("docs.dir", "trec");
        this.dataDir = new File(docsDir);
        if (!this.dataDir.isAbsolute()) {
            this.dataDir = new File(workDir, docsDir);
        }
        collectFiles(this.dataDir, this.inputFiles);
        if (this.inputFiles.isEmpty()) {
            throw new IllegalArgumentException("No files in dataDir: " + this.dataDir);
        }

        // A single try keeps the failure wrapped exactly once. getDeclaredConstructor().newInstance()
        // replaces the deprecated Class.newInstance(), which lets checked constructor exceptions
        // escape undeclared.
        try {
            String docParserName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
            this.trecDocParser = Class.forName(docParserName)
                    .asSubclass(TrecDocParser.class)
                    .getDeclaredConstructor()
                    .newInstance();

            String htmlParserName = config.get("html.parser", "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
            this.htmlParser = Class.forName(htmlParserName)
                    .asSubclass(HTMLParser.class)
                    .getDeclaredConstructor()
                    .newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        if (this.encoding == null) {
            this.encoding = StandardCharsets.ISO_8859_1.name();
        }
        this.excludeDocnameIteration = config.get("content.source.excludeIteration", false);
    }
}
