package io.annot8.components.documents.processors;

import com.drew.imaging.ImageMetadataReader;
import com.drew.imaging.ImageProcessingException;
import io.annot8.api.components.annotations.ComponentDescription;
import io.annot8.api.components.annotations.ComponentName;
import io.annot8.api.components.annotations.ComponentTags;
import io.annot8.api.components.annotations.SettingsClass;
import io.annot8.api.context.Context;
import io.annot8.common.data.content.FileContent;
import io.annot8.common.data.content.InputStreamContent;
import io.annot8.components.documents.data.ExtractionWithProperties;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;

@ComponentDescription("Extracts image and text from HTML (*.html) files")
@ComponentTags({"documents", "html", "extractor", "text", "images", "metadata"})
@ComponentName("HTML Extractor")
@SettingsClass(DocumentExtractorSettings.class)
/* loaded from: input_file:io/annot8/components/documents/processors/HtmlExtractor.class */
public class HtmlExtractor extends AbstractDocumentExtractorDescriptor<Processor> {

    /* loaded from: input_file:io/annot8/components/documents/processors/HtmlExtractor$Processor.class */
    public static class Processor extends AbstractDocumentExtractorProcessor<Document> {
        private final Logger logger;

        /**
         * Creates the processor and caches the logger obtained from {@code getLogger()}.
         *
         * @param context forwarded unchanged to the superclass constructor
         * @param documentExtractorSettings forwarded unchanged to the superclass constructor
         */
        public Processor(Context context, DocumentExtractorSettings documentExtractorSettings) {
            super(context, documentExtractorSettings);
            // Cached once so the extraction methods can log without calling getLogger() each time.
            this.logger = getLogger();
        }

        /**
         * Reports whether this processor can extract metadata.
         *
         * @return always {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isMetadataSupported() {
            return true;
        }

        /**
         * Reports whether this processor can extract text.
         *
         * @return always {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isTextSupported() {
            return true;
        }

        /**
         * Reports whether this processor can extract images.
         *
         * @return always {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isImagesSupported() {
            return true;
        }

        /**
         * Decides, from the file name alone, whether a file should be treated as HTML.
         *
         * @param fileContent content whose data is the {@link File} to inspect
         * @return {@code true} when the lower-cased file name ends with {@code .htm} or {@code .html}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean acceptFile(FileContent fileContent) {
            File candidate = (File) fileContent.getData();
            String lowerCaseName = candidate.getName().toLowerCase();
            if (lowerCaseName.endsWith(".htm")) {
                return true;
            }
            return lowerCaseName.endsWith(".html");
        }

        /**
         * Decides whether a stream looks like HTML by sniffing its leading bytes with
         * {@link FileMagic#valueOf(InputStream)}.
         *
         * @param inputStreamContent content whose data is the {@link InputStream} to inspect
         * @return {@code true} when the detected magic is {@link FileMagic#HTML}; {@code false} when
         *     it is anything else or the stream cannot be read
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean acceptInputStream(InputStreamContent inputStreamContent) {
            InputStream raw = (InputStream) inputStreamContent.getData();
            // NOTE(review): the stream obtained here is never closed by this method; whether that
            // leaks depends on whether getData() hands out a fresh stream per call - confirm.
            BufferedInputStream buffered = new BufferedInputStream(raw);
            FileMagic detected;
            try {
                detected = FileMagic.valueOf(buffered);
            } catch (IOException e) {
                // An unreadable stream is simply reported as "not HTML".
                return false;
            }
            return detected == FileMagic.HTML;
        }

        /* JADX WARN: Can't rename method to resolve collision */
        /**
         * Parses an HTML file into a jsoup {@link Document}, decoding it as UTF-8.
         *
         * @param fileContent content whose data is the {@link File} to parse
         * @return the parsed document
         * @throws IOException if jsoup cannot read the file
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Document extractDocument(FileContent fileContent) throws IOException {
            File source = (File) fileContent.getData();
            String charsetName = StandardCharsets.UTF_8.name();
            return Jsoup.parse(source, charsetName);
        }

        /* JADX WARN: Can't rename method to resolve collision */
        /**
         * Parses an HTML stream into a jsoup {@link Document}, decoding it as UTF-8 and using an
         * empty base URI.
         *
         * @param inputStreamContent content whose data is the {@link InputStream} to parse
         * @return the parsed document
         * @throws IOException if jsoup cannot read the stream
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Document extractDocument(InputStreamContent inputStreamContent) throws IOException {
            InputStream source = (InputStream) inputStreamContent.getData();
            String charsetName = StandardCharsets.UTF_8.name();
            String baseUri = "";
            return Jsoup.parse(source, charsetName, baseUri);
        }

        /**
         * Collects document-level metadata from a parsed HTML document.
         *
         * <p>The result always contains {@code "title"} (from {@code document.title()}) and
         * {@code "language"} (the {@code lang} attribute of the {@code <html>} element, or an empty
         * string when there is no such element). Each {@code <meta>} tag is then handled as follows:
         *
         * <ul>
         *   <li>The key is the {@code name} attribute; if that is blank and {@code http-equiv} is
         *       present, the key is {@code "http/" + http-equiv}.
         *   <li>If the key is still blank, only a {@code charset} attribute (if any) is recorded
         *       under {@code "charset"}.
         *   <li>Well-known names are stored under their {@link DocumentProperties} key; keywords are
         *       split on commas into a list. Any other name is stored under the name itself.
         *   <li>Repeated keys accumulate into a collection via {@code addOrAppend}.
         * </ul>
         *
         * @param document the parsed HTML document
         * @return a mutable map of metadata keys to a {@code String} or a collection of values
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Map<String, Object> extractMetadata(Document document) {
            Map<String, Object> metadata = new HashMap<>();
            metadata.put("title", document.title());

            // Guard against documents without an <html> element instead of failing with an NPE.
            Element html = document.select("html").first();
            metadata.put("language", html == null ? "" : html.attr("lang"));

            for (Element meta : document.getElementsByTag("meta")) {
                String name = meta.attr("name");
                if (name.isBlank() && meta.hasAttr("http-equiv")) {
                    name = "http/" + meta.attr("http-equiv");
                }
                if (name.isBlank()) {
                    // Anonymous <meta>: the only thing worth keeping is a charset declaration.
                    if (meta.hasAttr("charset")) {
                        metadata.put("charset", meta.attr("charset"));
                    }
                    continue;
                }

                String content = meta.attr("content");
                switch (name) {
                    case "application-name":
                        addOrAppend(metadata, DocumentProperties.APPLICATION, content);
                        break;
                    case DocumentProperties.AUTHOR:
                        addOrAppend(metadata, DocumentProperties.AUTHOR, content);
                        break;
                    case DocumentProperties.CREATOR:
                        addOrAppend(metadata, DocumentProperties.CREATOR, content);
                        break;
                    case "description":
                        addOrAppend(metadata, "description", content);
                        break;
                    case DocumentProperties.GENERATOR:
                        addOrAppend(metadata, DocumentProperties.GENERATOR, content);
                        break;
                    case DocumentProperties.KEYWORDS:
                        // Arrays.asList is a fixed-size view; copy it so that a second keywords
                        // tag can be appended to the stored list without throwing.
                        addOrAppend(
                                metadata,
                                DocumentProperties.KEYWORDS,
                                new ArrayList<>(Arrays.asList(content.split("\\s*,\\s*"))));
                        break;
                    case DocumentProperties.PUBLISHER:
                        addOrAppend(metadata, DocumentProperties.PUBLISHER, content);
                        break;
                    default:
                        addOrAppend(metadata, name, content);
                        break;
                }
            }
            return metadata;
        }

        /**
         * Stores a single value under a key, accumulating into a list when the key is already
         * present.
         *
         * <p>If the key is absent the value is stored as-is. Otherwise the existing value (or, when
         * it is already a collection, its elements) is copied into a new {@link ArrayList}, the new
         * value is appended, and that list replaces the old entry. Copying rather than mutating in
         * place means a fixed-size or immutable collection already stored in the map (for example
         * one produced by {@code Arrays.asList}) no longer causes an
         * {@link UnsupportedOperationException}.
         *
         * @param map the map to update
         * @param str the key
         * @param str2 the value to store or append
         */
        private void addOrAppend(Map<String, Object> map, String str, String str2) {
            if (!map.containsKey(str)) {
                map.put(str, str2);
                return;
            }
            Object existing = map.get(str);
            List<Object> merged = new ArrayList<>();
            if (existing instanceof Collection) {
                merged.addAll((Collection<?>) existing);
            } else {
                merged.add(existing);
            }
            merged.add(str2);
            map.put(str, merged);
        }

        /**
         * Stores several values under a key, accumulating into a list when the key is already
         * present.
         *
         * <p>The map never keeps a reference to the caller's collection: on first insert a mutable
         * copy is stored, and on later calls the existing value(s) and the new values are merged
         * into a fresh {@link ArrayList}. This avoids an {@link UnsupportedOperationException} when
         * the caller passes a fixed-size or immutable collection (for example
         * {@code Arrays.asList}) and the same key is seen again, and it stops later appends from
         * modifying the caller's collection.
         *
         * @param map the map to update
         * @param str the key
         * @param collection the values to store or append
         */
        private void addOrAppend(Map<String, Object> map, String str, Collection<String> collection) {
            if (!map.containsKey(str)) {
                map.put(str, new ArrayList<>(collection));
                return;
            }
            Object existing = map.get(str);
            List<Object> merged = new ArrayList<>();
            if (existing instanceof Collection) {
                merged.addAll((Collection<?>) existing);
            } else {
                merged.add(existing);
            }
            merged.addAll(collection);
            map.put(str, merged);
        }

        /**
         * Extracts the document's text as a single extraction.
         *
         * @param document the parsed HTML document
         * @return a one-element immutable collection wrapping {@code document.text()}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Collection<ExtractionWithProperties<String>> extractText(Document document) {
            String text = document.text();
            ExtractionWithProperties<String> extraction = new ExtractionWithProperties<>(text);
            return List.of(extraction);
        }

        /**
         * Extracts every readable image referenced by an {@code <img>} tag.
         *
         * <p>For each {@code <img>} with a non-blank {@code src}:
         *
         * <ul>
         *   <li>{@code data:image/...} sources are Base64-decoded from the part after the first
         *       comma; any other source is opened as a URL and downloaded, and the text after the
         *       last {@code '/'} is recorded as {@code "name"}.
         *   <li>The bytes are decoded with {@link ImageIO}; images that cannot be decoded are logged
         *       and skipped.
         *   <li>Image metadata (when readable), the {@code title} attribute, the 1-based position of
         *       the tag among all {@code <img>} tags ({@code "index"}), the {@code <figcaption>} of
         *       a directly enclosing {@code <figure>} ({@code "description"}) and all other
         *       attributes (prefixed with {@code "html/"}) are attached as properties.
         * </ul>
         *
         * <p>A failure on one image is logged and never prevents the remaining images from being
         * processed.
         *
         * @param document the parsed HTML document
         * @return the successfully decoded images with their properties, in document order
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Collection<ExtractionWithProperties<BufferedImage>> extractImages(Document document) {
            List<ExtractionWithProperties<BufferedImage>> images = new ArrayList<>();
            int index = 0;
            for (Element img : document.getElementsByTag("img")) {
                // Incremented before any skip so "index" reflects the tag's position in the page.
                index++;
                String src = img.attr("src");
                if (src.isBlank()) {
                    continue;
                }

                Map<String, Object> properties = new HashMap<>();
                byte[] bytes;
                if (src.startsWith("data:image/")) {
                    String[] parts = src.split(",", 2);
                    if (parts.length < 2) {
                        // The index is logged rather than src, which may be a very large data URI.
                        this.logger.error("Image {} has a data URI without a payload", index);
                        continue;
                    }
                    try {
                        bytes = Base64.getDecoder().decode(parts[1]);
                    } catch (IllegalArgumentException e) {
                        this.logger.error("Unable to decode Base64 data URI of image {}", index, e);
                        continue;
                    }
                } else {
                    // NOTE(review): src comes from the document being processed; opening it as a URL
                    // lets that document trigger arbitrary fetches (including file: URLs). Consider
                    // restricting schemes/hosts if documents are untrusted.
                    try (InputStream in = new URL(src).openStream()) {
                        bytes = IOUtils.toByteArray(in);
                        properties.put("name", src.substring(src.lastIndexOf('/') + 1));
                    } catch (MalformedURLException e) {
                        this.logger.error("Image source '" + src + "' is not a valid URL", e);
                        continue;
                    } catch (IOException e) {
                        this.logger.error("Unable to read image {} from URL", src, e);
                        continue;
                    }
                }

                try {
                    BufferedImage image;
                    try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
                        image = ImageIO.read(in);
                    }
                    if (image == null) {
                        // ImageIO returns null when no registered reader understands the data.
                        this.logger.warn("Null image {} extracted from document", src);
                        continue;
                    }

                    // Metadata is best-effort: a failure here must not discard the decoded image.
                    try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
                        properties.putAll(toMap(ImageMetadataReader.readMetadata(in)));
                    } catch (ImageProcessingException | IOException e) {
                        this.logger.warn("Unable to extract metadata from image {}", src, e);
                    }

                    properties.put("title", img.attr("title"));
                    properties.put("index", index);

                    Element parent = img.parent();
                    if (parent != null && "figure".equalsIgnoreCase(parent.tagName())) {
                        Element caption = parent.getElementsByTag("figcaption").first();
                        if (caption != null) {
                            properties.put("description", caption.text());
                        }
                    }

                    // "title" is already stored above under its own key, so it is not duplicated.
                    img.attributes().forEach(attribute -> {
                        if (!"title".equalsIgnoreCase(attribute.getKey())) {
                            properties.put("html/" + attribute.getKey(), attribute.getValue());
                        }
                    });

                    images.add(new ExtractionWithProperties<>(image, properties));
                } catch (Exception e) {
                    // Boundary catch: one bad image must not abort extraction of the others.
                    this.logger.error("Unable to read image from {}", src, e);
                }
            }
            return images;
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds the {@link Processor} for this descriptor.
     *
     * @param context passed to the {@link Processor} constructor
     * @param documentExtractorSettings passed to the {@link Processor} constructor
     * @return a new {@link Processor} instance
     */
    public Processor createComponent(Context context, DocumentExtractorSettings documentExtractorSettings) {
        return new Processor(context, documentExtractorSettings);
    }
}
