/*
 * Decompiled with CFR 0.152.
 */
package org.apache.spark.examples.ml;

import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.collection.mutable.WrappedArray;

public class JavaTokenizerExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
        List<Row> data = Arrays.asList(RowFactory.create((Object[])new Object[]{0, "Hi I heard about Spark"}), RowFactory.create((Object[])new Object[]{1, "I wish Java could use case classes"}), RowFactory.create((Object[])new Object[]{2, "Logistic,regression,models,are,neat"}));
        StructType schema = new StructType(new StructField[]{new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("sentence", DataTypes.StringType, false, Metadata.empty())});
        Dataset sentenceDataFrame = spark.createDataFrame(data, schema);
        Tokenizer tokenizer = (Tokenizer)((Tokenizer)new Tokenizer().setInputCol("sentence")).setOutputCol("words");
        RegexTokenizer regexTokenizer = ((RegexTokenizer)((RegexTokenizer)new RegexTokenizer().setInputCol("sentence")).setOutputCol("words")).setPattern("\\W");
        spark.udf().register("countTokens", (UDF1)new UDF1<WrappedArray, Integer>(){

            public Integer call(WrappedArray words) {
                return words.size();
            }
        }, DataTypes.IntegerType);
        Dataset tokenized = tokenizer.transform(sentenceDataFrame);
        tokenized.select("sentence", new String[]{"words"}).withColumn("tokens", functions.callUDF((String)"countTokens", (Column[])new Column[]{functions.col((String)"words")})).show(false);
        Dataset regexTokenized = regexTokenizer.transform(sentenceDataFrame);
        regexTokenized.select("sentence", new String[]{"words"}).withColumn("tokens", functions.callUDF((String)"countTokens", (Column[])new Column[]{functions.col((String)"words")})).show(false);
        spark.stop();
    }
}

