001/*
002 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.tap.hadoop.util;
023
024import java.io.IOException;
025import java.net.URI;
026import java.util.HashMap;
027import java.util.Map;
028import java.util.concurrent.atomic.AtomicInteger;
029
030import cascading.flow.hadoop.util.HadoopUtil;
031import cascading.tap.Tap;
032import org.apache.hadoop.conf.Configuration;
033import org.apache.hadoop.fs.FileStatus;
034import org.apache.hadoop.fs.FileSystem;
035import org.apache.hadoop.fs.Path;
036import org.apache.hadoop.mapred.FileOutputFormat;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;
041
042public class Hadoop18TapUtil
043  {
044  /** The Hadoop temporary path used to prevent collisions */
045  public static final String TEMPORARY_PATH = "_temporary";
046  /** Field LOG */
047  private static final Logger LOG = LoggerFactory.getLogger( Hadoop18TapUtil.class );
048  private static final Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>();
049
050  /**
051   * should only be called if not in a Flow
052   *
053   * @param conf
054   * @throws IOException
055   */
056  public static void setupJob( Configuration conf ) throws IOException
057    {
058    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
059
060    if( outputPath == null )
061      return;
062
063    if( getFSSafe( conf, outputPath ) == null )
064      return;
065
066    String taskID = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
067
068    if( taskID == null ) // need to stuff a fake id
069      {
070      String mapper = conf.getBoolean( "mapred.task.is.map", conf.getBoolean( "mapreduce.task.is.map", true ) ) ? "m" : "r";
071      String value = String.format( "attempt_%012d_0000_%s_000000_0", (int) Math.rint( System.currentTimeMillis() ), mapper );
072      conf.set( "mapred.task.id", value );
073      conf.set( "mapreduce.task.id", value );
074      }
075
076    makeTempPath( conf );
077
078    if( writeDirectlyToWorkingPath( conf, outputPath ) )
079      {
080      LOG.info( "writing directly to output path: {}", outputPath );
081      setWorkOutputPath( conf, outputPath );
082      return;
083      }
084
085    // "mapred.work.output.dir"
086    Path taskOutputPath = getTaskOutputPath( conf );
087    setWorkOutputPath( conf, taskOutputPath );
088    }
089
090  public static synchronized void setupTask( Configuration conf ) throws IOException
091    {
092    String workpath = conf.get( "mapred.work.output.dir" );
093
094    if( workpath == null )
095      return;
096
097    FileSystem fs = getFSSafe( conf, new Path( workpath ) );
098
099    if( fs == null )
100      return;
101
102    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
103
104    LOG.info( "setting up task: '{}' - {}", taskId, workpath );
105
106    AtomicInteger integer = pathCounts.get( workpath );
107
108    if( integer == null )
109      {
110      integer = new AtomicInteger();
111      pathCounts.put( workpath, integer );
112      }
113
114    integer.incrementAndGet();
115    }
116
117  public static boolean needsTaskCommit( Configuration conf ) throws IOException
118    {
119    String workpath = conf.get( "mapred.work.output.dir" );
120
121    if( workpath == null )
122      return false;
123
124    Path taskOutputPath = new Path( workpath );
125
126    if( taskOutputPath != null )
127      {
128      FileSystem fs = getFSSafe( conf, taskOutputPath );
129
130      if( fs == null )
131        return false;
132
133      if( fs.exists( taskOutputPath ) )
134        return true;
135      }
136
137    return false;
138    }
139
140  /**
141   * copies all files from the taskoutputpath to the outputpath
142   *
143   * @param conf
144   */
145  public static boolean commitTask( Configuration conf ) throws IOException
146    {
147    Path taskOutputPath = new Path( conf.get( "mapred.work.output.dir" ) );
148
149    FileSystem fs = getFSSafe( conf, taskOutputPath );
150
151    if( fs == null )
152      return false;
153
154    AtomicInteger integer = pathCounts.get( taskOutputPath.toString() );
155
156    if( integer.decrementAndGet() != 0 )
157      return false;
158
159    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
160
161    LOG.info( "committing task: '{}' - {}", taskId, taskOutputPath );
162
163    if( taskOutputPath != null )
164      {
165      if( writeDirectlyToWorkingPath( conf, taskOutputPath ) )
166        return true;
167
168      if( fs.exists( taskOutputPath ) )
169        {
170        Path jobOutputPath = taskOutputPath.getParent().getParent();
171        // Move the task outputs to their final place
172        moveTaskOutputs( conf, fs, jobOutputPath, taskOutputPath );
173
174        // Delete the temporary task-specific output directory
175        if( !fs.delete( taskOutputPath, true ) )
176          LOG.info( "failed to delete the temporary output directory of task: '{}' - {}", taskId, taskOutputPath );
177
178        LOG.info( "saved output of task '{}' to {}", taskId, jobOutputPath );
179        }
180      }
181
182    return true;
183    }
184
185  /**
186   * Called from flow step to remove temp dirs
187   *
188   * @param conf
189   * @throws IOException
190   */
191  public static void cleanupTapMetaData( Configuration conf, Tap tap ) throws IOException
192    {
193    cleanTempPath( conf, new Path( tap.getIdentifier() ) );
194    }
195
196  public static void writeSuccessMarker( Configuration conf ) throws IOException
197    {
198    writeSuccessMarker( conf, FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ) );
199    }
200
201  public static void writeSuccessMarker( Configuration conf, Path outputPath ) throws IOException
202    {
203    if( conf.getBoolean( "mapreduce.fileoutputcommitter.marksuccessfuljobs", true ) )
204      {
205      LOG.info( "writing success marker to {}", outputPath );
206
207      Path markerPath = new Path( outputPath, "_SUCCESS" );
208      FileSystem fs = markerPath.getFileSystem( conf );
209
210      fs.create( markerPath ).close();
211      }
212    }
213
214  /**
215   * May only be called once. should only be called if not in a flow
216   *
217   * @param conf
218   */
219  public static void cleanupJob( Configuration conf ) throws IOException
220    {
221    if( HadoopUtil.isInflow( conf ) )
222      return;
223
224    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
225
226    cleanTempPath( conf, outputPath );
227    }
228
229  private static synchronized void cleanTempPath( Configuration conf, Path outputPath ) throws IOException
230    {
231    // do the clean up of temporary directory
232
233    if( outputPath != null )
234      {
235      FileSystem fileSys = getFSSafe( conf, outputPath );
236
237      if( fileSys == null )
238        return;
239
240      if( !fileSys.exists( outputPath ) )
241        return;
242
243      Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
244
245      LOG.info( "deleting temp path {}", tmpDir );
246
247      if( fileSys.exists( tmpDir ) )
248        fileSys.delete( tmpDir, true );
249      }
250    }
251
252  private static FileSystem getFSSafe( Configuration conf, Path tmpDir )
253    {
254    try
255      {
256      return tmpDir.getFileSystem( conf );
257      }
258    catch( IOException e )
259      {
260      // ignore
261      }
262
263    return null;
264    }
265
266  private static Path getTaskOutputPath( Configuration conf )
267    {
268    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
269
270    Path p = new Path( FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId );
271
272    try
273      {
274      FileSystem fs = p.getFileSystem( conf );
275      return p.makeQualified( fs );
276      }
277    catch( IOException ie )
278      {
279      return p;
280      }
281    }
282
283  static void setWorkOutputPath( Configuration conf, Path outputDir )
284    {
285    outputDir = new Path( asJobConfInstance( conf ).getWorkingDirectory(), outputDir );
286    conf.set( "mapred.work.output.dir", outputDir.toString() );
287    }
288
289  public static void makeTempPath( Configuration conf ) throws IOException
290    {
291    // create job specific temporary directory in output path
292    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
293
294    if( outputPath != null )
295      {
296      Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
297      FileSystem fileSys = tmpDir.getFileSystem( conf );
298
299      // do not create the temp dir if write direct
300      if( isOutputWriteDirect( conf, fileSys ) )
301        return;
302
303      if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
304        LOG.error( "mkdirs failed to create {}", tmpDir );
305      }
306    }
307
308  private static void moveTaskOutputs( Configuration conf, FileSystem fs, Path jobOutputDir, Path taskOutput ) throws IOException
309    {
310    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
311
312    if( fs.isFile( taskOutput ) )
313      {
314      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );
315      if( !fs.rename( taskOutput, finalOutputPath ) )
316        {
317        if( !fs.delete( finalOutputPath, true ) )
318          throw new IOException( "Failed to delete earlier output of task: " + taskId );
319
320        if( !fs.rename( taskOutput, finalOutputPath ) )
321          throw new IOException( "Failed to save output of task: " + taskId );
322        }
323
324      LOG.debug( "Moved {} to {}", taskOutput, finalOutputPath );
325      }
326    else if( fs.getFileStatus( taskOutput ).isDir() )
327      {
328      FileStatus[] paths = fs.listStatus( taskOutput );
329      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );
330      fs.mkdirs( finalOutputPath );
331      if( paths != null )
332        {
333        for( FileStatus path : paths )
334          moveTaskOutputs( conf, fs, jobOutputDir, path.getPath() );
335        }
336      }
337    }
338
339  private static Path getFinalPath( Path jobOutputDir, Path taskOutput, Path taskOutputPath ) throws IOException
340    {
341    URI taskOutputUri = taskOutput.toUri();
342    URI relativePath = taskOutputPath.toUri().relativize( taskOutputUri );
343    if( taskOutputUri == relativePath )
344      {//taskOutputPath is not a parent of taskOutput
345      throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput );
346      }
347    if( relativePath.getPath().length() > 0 )
348      {
349      return new Path( jobOutputDir, relativePath.getPath() );
350      }
351    else
352      {
353      return jobOutputDir;
354      }
355    }
356
357  /** used in AWS EMR to disable temp paths on some file systems, s3. */
358  private static boolean writeDirectlyToWorkingPath( Configuration conf, Path path )
359    {
360    FileSystem fs = getFSSafe( conf, path );
361
362    if( fs == null )
363      return false;
364
365    boolean result = isOutputWriteDirect( conf, fs );
366
367    if( result )
368      LOG.info( "output direct is enabled for this fs: " + fs.getName() );
369
370    return result;
371    }
372
373  protected static boolean isOutputWriteDirect( Configuration conf, FileSystem fs )
374    {
375    return conf.getBoolean( "mapred.output.direct." + fs.getClass().getSimpleName(), false );
376    }
377  }