public abstract class SparkJob
extends org.openstreetmap.atlas.utilities.runtime.Command
implements java.io.Serializable
org.openstreetmap.atlas.utilities.runtime.Command.Flag, org.openstreetmap.atlas.utilities.runtime.Command.Optionality, org.openstreetmap.atlas.utilities.runtime.Command.Switch<T>, org.openstreetmap.atlas.utilities.runtime.Command.SwitchList

| Modifier and Type | Field and Description |
|---|---|
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.Map<java.lang.String,java.lang.String>> |
ADDITIONAL_SPARK_OPTIONS |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> |
COMPRESS_OUTPUT |
static java.lang.String |
EXITED_FILE |
static java.lang.String |
FAILED_FILE |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> |
INPUT |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> |
MASTER |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> |
OUTPUT |
static java.lang.String |
SAVING_SEPARATOR |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.regex.Pattern> |
SENSITIVE_CONFIGURATION_PATTERN |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<SparkContextProvider> |
SPARK_CONTEXT_PROVIDER |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.Map<java.lang.String,java.lang.String>> |
SPARK_OPTIONS |
static java.lang.String |
STARTED_FILE |
static org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> |
STARTED_FOLDER |
static java.lang.String |
SUCCESS_FILE |
| Constructor and Description |
|---|
SparkJob() |
| Modifier and Type | Method and Description |
|---|---|
protected org.apache.hadoop.conf.Configuration |
configuration() |
protected java.util.Map<java.lang.String,java.lang.String> |
configurationMap() |
protected java.lang.String |
getAlternateParallelFolderOutput(java.lang.String output,
java.lang.String name)
Get an alternate output based on the main output folder used for monitoring.
|
protected java.lang.String |
getAlternateSubFolderOutput(java.lang.String output,
java.lang.String name)
Get an alternate output based on the main output folder used for monitoring.
|
protected org.apache.spark.api.java.JavaSparkContext |
getContext() |
abstract java.lang.String |
getName() |
protected java.lang.String |
input(org.openstreetmap.atlas.utilities.runtime.CommandMap command) |
int |
onRun(org.openstreetmap.atlas.utilities.runtime.CommandMap command) |
protected java.lang.String |
output(org.openstreetmap.atlas.utilities.runtime.CommandMap command) |
protected java.util.List<java.lang.String> |
outputToClean(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
Define all the folders to clean before a run.
|
protected org.openstreetmap.atlas.streaming.resource.Resource |
resource(java.lang.String path) |
static org.openstreetmap.atlas.streaming.resource.Resource |
resource(java.lang.String path,
java.util.Map<java.lang.String,java.lang.String> configurationMap) |
protected void |
setContext(org.apache.spark.api.java.JavaSparkContext context) |
protected <T> void |
splitAndSaveAsHadoopFile(org.apache.spark.api.java.JavaPairRDD<java.lang.String,T> input,
java.lang.String path,
java.lang.Class<T> valueClass,
java.lang.Class<? extends org.apache.hadoop.mapred.lib.MultipleOutputFormat<java.lang.String,T>> formatterClass,
java.util.function.Function<java.lang.String,java.lang.String> keyReducer)
Instead of saving a full RDD(String, T) in a single folder, this function allows saving
subsets of an RDD(String, T) in separate folders.
|
abstract void |
start(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
The Spark job.
|
protected org.openstreetmap.atlas.utilities.runtime.Command.SwitchList |
switches() |
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> INPUT
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> OUTPUT
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> STARTED_FOLDER
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> MASTER
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.Map<java.lang.String,java.lang.String>> SPARK_OPTIONS
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.Map<java.lang.String,java.lang.String>> ADDITIONAL_SPARK_OPTIONS
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.lang.String> COMPRESS_OUTPUT
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<SparkContextProvider> SPARK_CONTEXT_PROVIDER
public static final org.openstreetmap.atlas.utilities.runtime.Command.Switch<java.util.regex.Pattern> SENSITIVE_CONFIGURATION_PATTERN
public static final java.lang.String SUCCESS_FILE
public static final java.lang.String STARTED_FILE
public static final java.lang.String EXITED_FILE
public static final java.lang.String FAILED_FILE
public static final java.lang.String SAVING_SEPARATOR
public static org.openstreetmap.atlas.streaming.resource.Resource resource(java.lang.String path,
java.util.Map<java.lang.String,java.lang.String> configurationMap)
public abstract java.lang.String getName()
public int onRun(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
onRun in class org.openstreetmap.atlas.utilities.runtime.Command
public abstract void start(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
command - The arguments passed to the main method
protected org.apache.hadoop.conf.Configuration configuration()
protected java.util.Map<java.lang.String,java.lang.String> configurationMap()
protected java.lang.String getAlternateParallelFolderOutput(java.lang.String output,
java.lang.String name)
output - The main output folder
name - The name of the alternate folder
protected java.lang.String getAlternateSubFolderOutput(java.lang.String output,
java.lang.String name)
output - The main output folder
name - The name of the alternate folder
protected org.apache.spark.api.java.JavaSparkContext getContext()
protected java.lang.String input(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
protected java.lang.String output(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
protected java.util.List<java.lang.String> outputToClean(org.openstreetmap.atlas.utilities.runtime.CommandMap command)
command - The command parameters sent to the main class.
protected org.openstreetmap.atlas.streaming.resource.Resource resource(java.lang.String path)
path - The path to open (in a URL format)
protected void setContext(org.apache.spark.api.java.JavaSparkContext context)
protected <T> void splitAndSaveAsHadoopFile(org.apache.spark.api.java.JavaPairRDD<java.lang.String,T> input,
java.lang.String path,
java.lang.Class<T> valueClass,
java.lang.Class<? extends org.apache.hadoop.mapred.lib.MultipleOutputFormat<java.lang.String,T>> formatterClass,
java.util.function.Function<java.lang.String,java.lang.String> keyReducer)
This function might be slow as it will generate a Spark stage for each category in this RDD. In the example above, it would create two stages. When the number of stages increases, it might be really slow.
T - The type of the object to save
input - The RDD to save
path - The output path of the job
valueClass - The type to save as a Hadoop file
formatterClass - The corresponding Hadoop formatter
keyReducer - The key reducing function explained above.
protected org.openstreetmap.atlas.utilities.runtime.Command.SwitchList switches()
switches in class org.openstreetmap.atlas.utilities.runtime.Command