001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.tap.hadoop.util;
022
023import java.io.IOException;
024import java.net.URI;
025import java.util.HashMap;
026import java.util.Map;
027import java.util.concurrent.atomic.AtomicInteger;
028
029import cascading.flow.hadoop.util.HadoopUtil;
030import cascading.tap.Tap;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileStatus;
033import org.apache.hadoop.fs.FileSystem;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.mapred.FileOutputFormat;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;
040
041public class Hadoop18TapUtil
042  {
043  /** The Hadoop temporary path used to prevent collisions */
044  public static final String TEMPORARY_PATH = "_temporary";
045  /** Field LOG */
046  private static final Logger LOG = LoggerFactory.getLogger( Hadoop18TapUtil.class );
047  private static final Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>();
048
049  /**
050   * should only be called if not in a Flow
051   *
052   * @param conf
053   * @throws IOException
054   */
055  public static void setupJob( Configuration conf ) throws IOException
056    {
057    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
058
059    if( outputPath == null )
060      return;
061
062    if( getFSSafe( conf, outputPath ) == null )
063      return;
064
065    String taskID = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
066
067    if( taskID == null ) // need to stuff a fake id
068      {
069      String mapper = conf.getBoolean( "mapred.task.is.map", conf.getBoolean( "mapreduce.task.is.map", true ) ) ? "m" : "r";
070      String value = String.format( "attempt_%012d_0000_%s_000000_0", (int) Math.rint( System.currentTimeMillis() ), mapper );
071      conf.set( "mapred.task.id", value );
072      conf.set( "mapreduce.task.id", value );
073      }
074
075    makeTempPath( conf );
076
077    if( writeDirectlyToWorkingPath( conf, outputPath ) )
078      {
079      LOG.info( "writing directly to output path: {}", outputPath );
080      setWorkOutputPath( conf, outputPath );
081      return;
082      }
083
084    // "mapred.work.output.dir"
085    Path taskOutputPath = getTaskOutputPath( conf );
086    setWorkOutputPath( conf, taskOutputPath );
087    }
088
089  public static synchronized void setupTask( Configuration conf ) throws IOException
090    {
091    String workpath = conf.get( "mapred.work.output.dir" );
092
093    if( workpath == null )
094      return;
095
096    FileSystem fs = getFSSafe( conf, new Path( workpath ) );
097
098    if( fs == null )
099      return;
100
101    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
102
103    LOG.info( "setting up task: '{}' - {}", taskId, workpath );
104
105    AtomicInteger integer = pathCounts.get( workpath );
106
107    if( integer == null )
108      {
109      integer = new AtomicInteger();
110      pathCounts.put( workpath, integer );
111      }
112
113    integer.incrementAndGet();
114    }
115
116  public static boolean needsTaskCommit( Configuration conf ) throws IOException
117    {
118    String workpath = conf.get( "mapred.work.output.dir" );
119
120    if( workpath == null )
121      return false;
122
123    Path taskOutputPath = new Path( workpath );
124
125    if( taskOutputPath != null )
126      {
127      FileSystem fs = getFSSafe( conf, taskOutputPath );
128
129      if( fs == null )
130        return false;
131
132      if( fs.exists( taskOutputPath ) )
133        return true;
134      }
135
136    return false;
137    }
138
139  /**
140   * copies all files from the taskoutputpath to the outputpath
141   *
142   * @param conf
143   */
144  public static boolean commitTask( Configuration conf ) throws IOException
145    {
146    Path taskOutputPath = new Path( conf.get( "mapred.work.output.dir" ) );
147
148    FileSystem fs = getFSSafe( conf, taskOutputPath );
149
150    if( fs == null )
151      return false;
152
153    AtomicInteger integer = pathCounts.get( taskOutputPath.toString() );
154
155    if( integer.decrementAndGet() != 0 )
156      return false;
157
158    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
159
160    LOG.info( "committing task: '{}' - {}", taskId, taskOutputPath );
161
162    if( taskOutputPath != null )
163      {
164      if( writeDirectlyToWorkingPath( conf, taskOutputPath ) )
165        return true;
166
167      if( fs.exists( taskOutputPath ) )
168        {
169        Path jobOutputPath = taskOutputPath.getParent().getParent();
170        // Move the task outputs to their final place
171        moveTaskOutputs( conf, fs, jobOutputPath, taskOutputPath );
172
173        // Delete the temporary task-specific output directory
174        if( !fs.delete( taskOutputPath, true ) )
175          LOG.info( "failed to delete the temporary output directory of task: '{}' - {}", taskId, taskOutputPath );
176
177        LOG.info( "saved output of task '{}' to {}", taskId, jobOutputPath );
178        }
179      }
180
181    return true;
182    }
183
184  /**
185   * Called from flow step to remove temp dirs
186   *
187   * @param conf
188   * @throws IOException
189   */
190  public static void cleanupTapMetaData( Configuration conf, Tap tap ) throws IOException
191    {
192    cleanTempPath( conf, new Path( tap.getIdentifier() ) );
193    }
194
195  public static void writeSuccessMarker( Configuration conf ) throws IOException
196    {
197    writeSuccessMarker( conf, FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ) );
198    }
199
200  public static void writeSuccessMarker( Configuration conf, Path outputPath ) throws IOException
201    {
202    if( conf.getBoolean( "mapreduce.fileoutputcommitter.marksuccessfuljobs", true ) )
203      {
204      LOG.info( "writing success marker to {}", outputPath );
205
206      Path markerPath = new Path( outputPath, "_SUCCESS" );
207      FileSystem fs = markerPath.getFileSystem( conf );
208
209      fs.create( markerPath ).close();
210      }
211    }
212
213  /**
214   * May only be called once. should only be called if not in a flow
215   *
216   * @param conf
217   */
218  public static void cleanupJob( Configuration conf ) throws IOException
219    {
220    if( HadoopUtil.isInflow( conf ) )
221      return;
222
223    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
224
225    cleanTempPath( conf, outputPath );
226    }
227
228  private static synchronized void cleanTempPath( Configuration conf, Path outputPath ) throws IOException
229    {
230    // do the clean up of temporary directory
231
232    if( outputPath != null )
233      {
234      FileSystem fileSys = getFSSafe( conf, outputPath );
235
236      if( fileSys == null )
237        return;
238
239      if( !fileSys.exists( outputPath ) )
240        return;
241
242      Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
243
244      LOG.info( "deleting temp path {}", tmpDir );
245
246      if( fileSys.exists( tmpDir ) )
247        fileSys.delete( tmpDir, true );
248      }
249    }
250
251  private static FileSystem getFSSafe( Configuration conf, Path tmpDir )
252    {
253    try
254      {
255      return tmpDir.getFileSystem( conf );
256      }
257    catch( IOException e )
258      {
259      // ignore
260      }
261
262    return null;
263    }
264
265  private static Path getTaskOutputPath( Configuration conf )
266    {
267    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
268
269    Path p = new Path( FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId );
270
271    try
272      {
273      FileSystem fs = p.getFileSystem( conf );
274      return p.makeQualified( fs );
275      }
276    catch( IOException ie )
277      {
278      return p;
279      }
280    }
281
282  static void setWorkOutputPath( Configuration conf, Path outputDir )
283    {
284    outputDir = new Path( asJobConfInstance( conf ).getWorkingDirectory(), outputDir );
285    conf.set( "mapred.work.output.dir", outputDir.toString() );
286    }
287
288  public static void makeTempPath( Configuration conf ) throws IOException
289    {
290    // create job specific temporary directory in output path
291    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );
292
293    if( outputPath != null )
294      {
295      Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
296      FileSystem fileSys = tmpDir.getFileSystem( conf );
297
298      // do not create the temp dir if write direct
299      if( isOutputWriteDirect( conf, fileSys ) )
300        return;
301
302      if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
303        LOG.error( "mkdirs failed to create {}", tmpDir );
304      }
305    }
306
307  private static void moveTaskOutputs( Configuration conf, FileSystem fs, Path jobOutputDir, Path taskOutput ) throws IOException
308    {
309    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );
310
311    if( fs.isFile( taskOutput ) )
312      {
313      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );
314      if( !fs.rename( taskOutput, finalOutputPath ) )
315        {
316        if( !fs.delete( finalOutputPath, true ) )
317          throw new IOException( "Failed to delete earlier output of task: " + taskId );
318
319        if( !fs.rename( taskOutput, finalOutputPath ) )
320          throw new IOException( "Failed to save output of task: " + taskId );
321        }
322
323      LOG.debug( "Moved {} to {}", taskOutput, finalOutputPath );
324      }
325    else if( fs.getFileStatus( taskOutput ).isDir() )
326      {
327      FileStatus[] paths = fs.listStatus( taskOutput );
328      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );
329      fs.mkdirs( finalOutputPath );
330      if( paths != null )
331        {
332        for( FileStatus path : paths )
333          moveTaskOutputs( conf, fs, jobOutputDir, path.getPath() );
334        }
335      }
336    }
337
338  private static Path getFinalPath( Path jobOutputDir, Path taskOutput, Path taskOutputPath ) throws IOException
339    {
340    URI taskOutputUri = taskOutput.toUri();
341    URI relativePath = taskOutputPath.toUri().relativize( taskOutputUri );
342    if( taskOutputUri == relativePath )
343      {//taskOutputPath is not a parent of taskOutput
344      throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput );
345      }
346    if( relativePath.getPath().length() > 0 )
347      {
348      return new Path( jobOutputDir, relativePath.getPath() );
349      }
350    else
351      {
352      return jobOutputDir;
353      }
354    }
355
356  /** used in AWS EMR to disable temp paths on some file systems, s3. */
357  private static boolean writeDirectlyToWorkingPath( Configuration conf, Path path )
358    {
359    FileSystem fs = getFSSafe( conf, path );
360
361    if( fs == null )
362      return false;
363
364    boolean result = isOutputWriteDirect( conf, fs );
365
366    if( result )
367      LOG.info( "output direct is enabled for this fs: " + fs.getName() );
368
369    return result;
370    }
371
372  protected static boolean isOutputWriteDirect( Configuration conf, FileSystem fs )
373    {
374    return conf.getBoolean( "mapred.output.direct." + fs.getClass().getSimpleName(), false );
375    }
376  }