package ai.tripl.arc.extract;

import ai.tripl.arc.api.API;
import ai.tripl.arc.util.CloudUtils$;
import ai.tripl.arc.util.ExtractUtils$;
import ai.tripl.arc.util.MetadataUtils$;
import ai.tripl.arc.util.log.logger.Logger;
import java.util.HashMap;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.storage.StorageLevel$;
import scala.MatchError;
import scala.None$;
import scala.Option;
import scala.Option$;
import scala.Predef$;
import scala.Some;
import scala.StringContext;
import scala.collection.immutable.Nil$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;

/* compiled from: TextExtract.scala */
/* loaded from: input_file:ai/tripl/arc/extract/TextExtract$.class */
/**
 * Singleton holder for the TextExtract stage (Java view of the Scala {@code object TextExtract}).
 *
 * <p>The single entry point, {@link #extract}, reads the stage input as text (batch or
 * streaming), adds Arc's internal columns, optionally applies schema metadata and
 * repartitioning, and registers the result as a temporary view.
 */
public final class TextExtract$ {
    /** The one and only instance; created eagerly when the class is initialised. */
    public static final TextExtract$ MODULE$ = new TextExtract$();

    private TextExtract$() {
    }

    /**
     * Runs the TextExtract stage.
     *
     * <p>The work is done in three consecutive phases, each with its own handler so that a
     * failure is wrapped exactly once by the wrapper belonging to the phase that failed
     * (the wrapper is constructed with the stage-detail map and the causing exception):
     * <ol>
     *   <li>resolve the optional schema from {@code textExtract.cols()};</li>
     *   <li>read the input into a dataset;</li>
     *   <li>shape the dataset, register the output view, record statistics and log exit.</li>
     * </ol>
     *
     * @param textExtract  stage configuration
     * @param sparkSession session used for all reads and for creating empty datasets
     * @param logger       receives the "enter" and "exit" events together with the stage detail
     * @param aRCContext   consulted for {@code isStreaming()} to choose the read path
     * @return the output dataset wrapped by {@code Option.apply}
     */
    public Option<Dataset<Row>> extract(API.TextExtract textExtract, SparkSession sparkSession, Logger logger, API.ARCContext aRCContext) {
        long startTime = System.currentTimeMillis();
        HashMap<String, Object> stageDetail = describeStage(textExtract);
        logger.info().field("event", "enter").map("stage", stageDetail).log();

        // Phase 1: schema resolution.
        Option<StructType> schema;
        try {
            schema = ExtractUtils$.MODULE$.getSchema(textExtract.cols(), sparkSession, logger);
        } catch (Exception e) {
            throw new TextExtract$$anon$1(stageDetail, e);
        }

        // Phase 2: reading the input.
        Dataset<Row> raw;
        try {
            raw = readInput(textExtract, sparkSession, logger, aRCContext, schema);
        } catch (Exception e) {
            throw new TextExtract$$anon$2(stageDetail, e);
        }

        // Phase 3: shaping, view registration, statistics and the exit log entry.
        try {
            Dataset<Row> output = shapeOutput(textExtract, sparkSession, schema, raw, stageDetail);
            registerAndMeasure(textExtract, output, stageDetail);
            logger.info()
                    .field("event", "exit")
                    .field("duration", BoxesRunTime.boxToLong(System.currentTimeMillis() - startTime))
                    .map("stage", stageDetail)
                    .log();
            return Option$.MODULE$.apply(output);
        } catch (Exception e) {
            throw new TextExtract$$anon$3(stageDetail, e);
        }
    }

    /** Builds the stage-detail map that is attached to log events and to failure wrappers. */
    private static HashMap<String, Object> describeStage(API.TextExtract textExtract) {
        HashMap<String, Object> detail = new HashMap<>();
        detail.put("type", textExtract.getType());
        detail.put("name", textExtract.name());
        // The generated function receives the map and is applied only when a description is
        // present; presumably it stores the description in the map (its body is not visible here).
        textExtract.description().foreach(new TextExtract$$anonfun$extract$1(detail));
        detail.put("input", textExtract.input());
        detail.put("outputView", textExtract.outputView());
        detail.put("persist", Boolean.valueOf(textExtract.persist()));
        detail.put("contiguousIndex", Boolean.valueOf(textExtract.contiguousIndex()));
        detail.put("multiLine", Boolean.valueOf(textExtract.multiLine()));
        return detail;
    }

    /** Reads the stage input, as a stream when the context is streaming and as a batch otherwise. */
    private static Dataset<Row> readInput(API.TextExtract textExtract, SparkSession sparkSession, Logger logger, API.ARCContext aRCContext, Option<StructType> schema) throws Exception {
        boolean streaming = aRCContext.isStreaming();
        // Both read paths need the Hadoop configuration applied before touching the input.
        CloudUtils$.MODULE$.setHadoopConfiguration(textExtract.authentication(), sparkSession, logger);

        if (streaming) {
            if (schema.isEmpty()) {
                throw new Exception("TextExtract requires 'schemaURI' to be set if Arc is running in streaming mode.");
            }
            return sparkSession.readStream().schema(schema.get()).text(textExtract.input());
        }

        try {
            return readBatch(textExtract, sparkSession);
        } catch (Exception e) {
            // A missing input path is not an error: an empty dataframe is returned and the
            // caller decides what to do with a zero-column result. Everything else propagates.
            if (isMissingPath(e)) {
                return sparkSession.emptyDataFrame();
            }
            throw e;
        }
    }

    /** Performs the batch read: whole files when {@code multiLine} is set, otherwise line by line. */
    private static Dataset<Row> readBatch(API.TextExtract textExtract, SparkSession sparkSession) {
        if (!textExtract.multiLine()) {
            return sparkSession.read().option("wholetext", "false").textFile(textExtract.input()).toDF();
        }
        Option<String> basePath = textExtract.basePath();
        if (basePath.isDefined()) {
            // This is a text stage, so the input is read as whole text files (not Parquet),
            // with the configured basePath forwarded to the reader.
            return sparkSession.read()
                    .option("wholetext", "true")
                    .option("basePath", basePath.get())
                    .textFile(textExtract.input())
                    .toDF();
        }
        return sparkSession.read().option("wholetext", "true").textFile(textExtract.input()).toDF();
    }

    /** Tells whether the failure is an AnalysisException whose message reports a missing path. */
    private static boolean isMissingPath(Throwable failure) {
        if (!(failure instanceof AnalysisException)) {
            return false;
        }
        String message = failure.getMessage();
        return message != null && message.contains("Path does not exist");
    }

    /**
     * Turns the raw read result into the stage output: substitutes an empty dataset built from
     * the schema when the read produced no columns, adds the internal columns, applies schema
     * metadata when a schema exists, and repartitions when {@code numPartitions} is set.
     */
    private static Dataset<Row> shapeOutput(API.TextExtract textExtract, SparkSession sparkSession, Option<StructType> schema, Dataset<Row> raw, HashMap<String, Object> stageDetail) throws Exception {
        Dataset<Row> base = raw;
        if (raw.schema().length() == 0) {
            stageDetail.put("records", Integer.valueOf(0));
            if (schema.isEmpty()) {
                throw new Exception("TextExtract has produced 0 columns and no schema has been provided to create an empty dataframe.");
            }
            base = sparkSession.createDataFrame(sparkSession.sparkContext().emptyRDD(ClassTag$.MODULE$.apply(Row.class)), schema.get());
        }

        Dataset<Row> withInternalColumns = ExtractUtils$.MODULE$.addInternalColumns(base, textExtract.contiguousIndex());
        Dataset<Row> withMetadata = schema.isDefined()
                ? MetadataUtils$.MODULE$.setMetadata(withInternalColumns, schema.get())
                : withInternalColumns;

        Option<Object> numPartitions = textExtract.numPartitions();
        return numPartitions.isDefined()
                ? withMetadata.repartition(BoxesRunTime.unboxToInt(numPartitions.get()))
                : withMetadata;
    }

    /**
     * Registers the output view and, for non-streaming datasets, records file, column and
     * partition counts; when persistence is requested the dataset is persisted and counted.
     */
    private static void registerAndMeasure(API.TextExtract textExtract, Dataset<Row> output, HashMap<String, Object> stageDetail) {
        output.createOrReplaceTempView(textExtract.outputView());
        if (output.isStreaming()) {
            return;
        }
        stageDetail.put("inputFiles", Integer.valueOf(output.inputFiles().length));
        stageDetail.put("outputColumns", Integer.valueOf(output.schema().length()));
        stageDetail.put("numPartitions", Integer.valueOf(output.rdd().partitions().length));
        if (textExtract.persist()) {
            output.persist(StorageLevel$.MODULE$.MEMORY_AND_DISK_SER());
            stageDetail.put("records", Long.valueOf(output.count()));
        }
    }
}
