package io.annot8.components.documents.processors;

import com.drew.metadata.Metadata;
import com.drew.metadata.xmp.XmpReader;
import io.annot8.api.components.annotations.ComponentDescription;
import io.annot8.api.components.annotations.ComponentName;
import io.annot8.api.components.annotations.ComponentTags;
import io.annot8.api.components.annotations.SettingsClass;
import io.annot8.api.context.Context;
import io.annot8.api.exceptions.ProcessingException;
import io.annot8.common.data.content.FileContent;
import io.annot8.common.data.content.InputStreamContent;
import io.annot8.components.documents.data.ExtractionWithProperties;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.slf4j.Logger;

@ComponentDescription("Extracts image and text from PDF (*.pdf) files")
@ComponentTags({"documents", "pdf", "extractor", "text", "images", "metadata"})
@ComponentName("PDF Extractor")
@SettingsClass(DocumentExtractorSettings.class)
/* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor.class */
public class PdfExtractor extends AbstractDocumentExtractorDescriptor<Processor> {

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$Processor.class */
    public static class Processor extends AbstractDocumentExtractorProcessor<PDDocument> {
        private final Logger logger;

        /* JADX INFO: Access modifiers changed from: private */
        /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$Processor$ImageExtractor.class */
        public static class ImageExtractor extends PDFStreamEngine {
            private int imageNumber = 0;
            private final List<ExtractionWithProperties<BufferedImage>> extractedImages = new ArrayList();
            private int pageNumber = -1;

            private ImageExtractor() {
            }

            protected void setPageNumber(int i) {
                this.pageNumber = i;
            }

            protected List<ExtractionWithProperties<BufferedImage>> getExtractedImages() {
                return this.extractedImages;
            }

            protected void processOperator(Operator operator, List<COSBase> list) throws IOException {
                if (!"Do".equals(operator.getName())) {
                    super.processOperator(operator, list);
                    return;
                }
                PDImageXObject xObject = getResources().getXObject(list.get(0));
                if (!(xObject instanceof PDImageXObject)) {
                    if (xObject instanceof PDFormXObject) {
                        showForm((PDFormXObject) xObject);
                        return;
                    }
                    return;
                }
                HashMap hashMap = new HashMap();
                PDImageXObject pDImageXObject = xObject;
                PDMetadata metadata = pDImageXObject.getMetadata();
                if (metadata != null) {
                    Metadata metadata2 = new Metadata();
                    new XmpReader().extract(metadata.toByteArray(), metadata2);
                    hashMap.putAll(AbstractDocumentExtractorProcessor.toMap(metadata2));
                }
                this.imageNumber++;
                hashMap.put("index", Integer.valueOf(this.imageNumber));
                hashMap.put("page", Integer.valueOf(this.pageNumber));
                this.extractedImages.add(new ExtractionWithProperties<>(pDImageXObject.getImage(), hashMap));
            }
        }

        /**
         * Creates the processor and caches the logger returned by {@code getLogger()}.
         *
         * @param context passed straight to the superclass constructor
         * @param documentExtractorSettings passed straight to the superclass constructor
         */
        public Processor(Context context, DocumentExtractorSettings documentExtractorSettings) {
            super(context, documentExtractorSettings);
            this.logger = getLogger();
        }

        /**
         * Reports that this processor can extract metadata.
         *
         * @return always {@code true}
         */
        @Override
        public boolean isMetadataSupported() {
            return true;
        }

        /**
         * Reports that this processor can extract text.
         *
         * @return always {@code true}
         */
        @Override
        public boolean isTextSupported() {
            return true;
        }

        /**
         * Reports that this processor can extract images.
         *
         * @return always {@code true}
         */
        @Override
        public boolean isImagesSupported() {
            return true;
        }

        /**
         * Accepts a file when its name ends with {@code .pdf}, ignoring case.
         *
         * <p>Only the file name is inspected; the file content is not read.
         *
         * @param fileContent the file-backed content to test
         * @return {@code true} if the file name has a {@code .pdf} extension
         */
        @Override
        public boolean acceptFile(FileContent fileContent) {
            String fileName = fileContent.getData().getName();
            // Locale.ROOT keeps the case folding independent of the JVM's default locale.
            return fileName.toLowerCase(Locale.ROOT).endsWith(".pdf");
        }

        /**
         * Accepts a stream when its leading bytes identify it as a PDF.
         *
         * <p>The stream obtained from the content is closed before this method returns, whether or
         * not the check succeeds.
         *
         * @param inputStreamContent the stream-backed content to test
         * @return {@code true} if the detected file type is {@link FileMagic#PDF}; {@code false} if
         *     it is another type or the stream could not be read
         */
        @Override
        public boolean acceptInputStream(InputStreamContent inputStreamContent) {
            try (InputStream inputStream = inputStreamContent.getData()) {
                // The stream is wrapped in a BufferedInputStream before being handed to FileMagic.
                return FileMagic.valueOf(new BufferedInputStream(inputStream)) == FileMagic.PDF;
            } catch (IOException e) {
                // An unreadable stream is simply not accepted; record why instead of hiding it.
                this.logger.debug("Unable to read input stream to determine whether it is a PDF", e);
                return false;
            }
        }

        /**
         * Loads the PDF held by the given file content.
         *
         * @param fileContent the file-backed content to load
         * @return the loaded document
         * @throws IOException if PDFBox fails to load the file
         */
        @Override
        public PDDocument extractDocument(FileContent fileContent) throws IOException {
            File pdfFile = fileContent.getData();
            return PDDocument.load(pdfFile);
        }

        /**
         * Loads the PDF held by the given stream content.
         *
         * <p>The stream obtained from the content is closed once loading has finished or failed.
         *
         * @param inputStreamContent the stream-backed content to load
         * @return the loaded document
         * @throws IOException if PDFBox fails to load the stream, or the stream fails to close
         */
        @Override
        public PDDocument extractDocument(InputStreamContent inputStreamContent) throws IOException {
            // PDDocument.load(InputStream) buffers the whole stream while loading (PDFBox 2.x), so the
            // source stream is no longer needed afterwards and must not be left open.
            try (InputStream inputStream = inputStreamContent.getData()) {
                return PDDocument.load(inputStream);
            }
        }

        /**
         * Copies the document information of the PDF into a property map.
         *
         * <p>The map holds the author, creation date, creator, keywords, last-modified date,
         * producer, subject and title, one {@code "custom." + key} entry for every key returned by
         * {@code getMetadataKeys()}, and the page count. Values are stored exactly as PDFBox returns
         * them, so an entry may map to {@code null}.
         *
         * @param pDDocument the document to read
         * @return a new mutable map of property name to value
         */
        @Override
        public Map<String, Object> extractMetadata(PDDocument pDDocument) {
            PDDocumentInformation info = pDDocument.getDocumentInformation();
            Map<String, Object> properties = new HashMap<>();

            properties.put(DocumentProperties.AUTHOR, info.getAuthor());
            properties.put(DocumentProperties.CREATION_DATE, toTemporal(info.getCreationDate()));
            properties.put(DocumentProperties.CREATOR, info.getCreator());
            properties.put(DocumentProperties.KEYWORDS, info.getKeywords());
            properties.put(DocumentProperties.LAST_MODIFIED_DATE, toTemporal(info.getModificationDate()));
            properties.put(DocumentProperties.PRODUCER, info.getProducer());
            properties.put(DocumentProperties.SUBJECT, info.getSubject());
            properties.put("title", info.getTitle());

            // Every key of the information dictionary is also exposed under a "custom." prefix.
            for (String key : info.getMetadataKeys()) {
                String value = info.getCustomMetadataValue(key);
                properties.put("custom." + key, value);
            }

            int pageCount = pDDocument.getNumberOfPages();
            properties.put(DocumentProperties.PAGE_COUNT, pageCount);
            return properties;
        }

        /**
         * Extracts the text of the whole document as a single extraction.
         *
         * @param pDDocument the document to read
         * @return an immutable one-element collection holding the document text
         * @throws ProcessingException if PDFBox fails to extract the text; the original
         *     {@link IOException} is kept as the cause
         */
        @Override
        public Collection<ExtractionWithProperties<String>> extractText(PDDocument pDDocument) throws ProcessingException {
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                String text = stripper.getText(pDDocument);
                ExtractionWithProperties<String> extraction = new ExtractionWithProperties<>(text);
                return List.of(extraction);
            } catch (IOException e) {
                throw new ProcessingException("Unable to extract text from PDF", e);
            }
        }

        /**
         * Extracts the images from every page of the document.
         *
         * <p>Pages are numbered from 1. A page that fails with an {@link IOException} is logged as
         * a warning and skipped, and extraction continues with the next page.
         *
         * @param pDDocument the document to read
         * @return the images collected from all pages, in the order they were encountered
         */
        @Override
        public Collection<ExtractionWithProperties<BufferedImage>> extractImages(PDDocument pDDocument) {
            ImageExtractor extractor = new ImageExtractor();
            int pageIndex = 0;
            for (PDPage page : pDDocument.getPages()) {
                pageIndex++;
                try {
                    extractor.setPageNumber(pageIndex);
                    extractor.processPage(page);
                } catch (IOException e) {
                    this.logger.warn("Unable to extract images from page {} of PDF", pageIndex, e);
                }
            }
            return extractor.getExtractedImages();
        }
    }

    /**
     * Creates the {@link Processor} that performs the PDF extraction.
     *
     * <p>NOTE(review): the decompiler reported that this method was originally {@code protected};
     * confirm against the superclass before narrowing the access or adding {@code @Override}.
     *
     * @param context passed to the {@link Processor} constructor
     * @param documentExtractorSettings passed to the {@link Processor} constructor
     * @return a new processor instance
     */
    public Processor createComponent(Context context, DocumentExtractorSettings documentExtractorSettings) {
        return new Processor(context, documentExtractorSettings);
    }
}
