/*
 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.util;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;

/**
 * Static helpers that set up, commit and clean up the {@code _temporary} work directory
 * placed under a job output path.
 * <p>
 * The current work path is carried in the {@code mapred.work.output.dir} configuration property. A
 * per-work-path counter is incremented by {@link #setupTask(Configuration)} and decremented by
 * {@link #commitTask(Configuration)}; only the call that brings the counter back to zero performs the commit.
 */
public class Hadoop18TapUtil
  {
  /** The Hadoop temporary path used to prevent collisions */
  public static final String TEMPORARY_PATH = "_temporary";
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( Hadoop18TapUtil.class );
  /** Number of set up but not yet committed tasks per work path; only accessed while holding the class lock. */
  private static final Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>();

  /**
   * Prepares the given configuration for writing: ensures a task id is present, creates the temporary
   * directory and stores the work output path in {@code mapred.work.output.dir}.
   * <p>
   * Does nothing if no output path is configured or its file system cannot be resolved.
   * <p>
   * should only be called if not in a Flow
   *
   * @param conf the configuration to read from and update
   * @throws IOException if the temporary directory cannot be prepared
   */
  public static void setupJob( Configuration conf ) throws IOException
    {
    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    if( outputPath == null )
      return;

    if( getFSSafe( conf, outputPath ) == null )
      return;

    String taskID = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    if( taskID == null ) // need to stuff a fake id
      {
      String mapper = conf.getBoolean( "mapred.task.is.map", conf.getBoolean( "mapreduce.task.is.map", true ) ) ? "m" : "r";
      // NOTE(review): narrowing a millisecond timestamp to int saturates at Integer.MAX_VALUE, so this id is
      // the same on every call. Left as-is because the work path (and the per-path counter) is derived from
      // it -- confirm whether a unique id is actually wanted before changing.
      String value = String.format( "attempt_%012d_0000_%s_000000_0", (int) Math.rint( System.currentTimeMillis() ), mapper );
      conf.set( "mapred.task.id", value );
      conf.set( "mapreduce.task.id", value );
      }

    makeTempPath( conf );

    if( writeDirectlyToWorkingPath( conf, outputPath ) )
      {
      LOG.info( "writing directly to output path: {}", outputPath );
      setWorkOutputPath( conf, outputPath );
      return;
      }

    // "mapred.work.output.dir"
    Path taskOutputPath = getTaskOutputPath( conf );
    setWorkOutputPath( conf, taskOutputPath );
    }

  /**
   * Registers one more task writing to the work path currently stored in {@code mapred.work.output.dir}.
   * <p>
   * Does nothing if the property is unset or the file system of the work path cannot be resolved.
   *
   * @param conf the configuration holding the work path
   * @throws IOException declared for callers; not thrown by the current implementation
   */
  public static synchronized void setupTask( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return;

    FileSystem fs = getFSSafe( conf, new Path( workpath ) );

    if( fs == null )
      return;

    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    LOG.info( "setting up task: '{}' - {}", taskId, workpath );

    AtomicInteger integer = pathCounts.get( workpath );

    if( integer == null )
      {
      integer = new AtomicInteger();
      pathCounts.put( workpath, integer );
      }

    integer.incrementAndGet();
    }

  /**
   * Reports whether the work path stored in {@code mapred.work.output.dir} exists and therefore
   * has output to commit.
   *
   * @param conf the configuration holding the work path
   * @return true if the work path is set, its file system can be resolved, and the path exists
   * @throws IOException if the existence check fails
   */
  public static boolean needsTaskCommit( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return false;

    Path taskOutputPath = new Path( workpath );
    FileSystem fs = getFSSafe( conf, taskOutputPath );

    return fs != null && fs.exists( taskOutputPath );
    }

  /**
   * copies all files from the taskoutputpath to the outputpath
   * <p>
   * The commit is only performed by the call that brings the counter registered through
   * {@link #setupTask(Configuration)} back to zero; other calls return false without touching the
   * file system. When direct output is enabled for the file system, nothing is moved.
   *
   * @param conf the configuration holding the work path
   * @return true if this call committed the task (or direct output made a move unnecessary), false if the work
   * path is unset, its file system cannot be resolved, or other tasks still use the work path
   * @throws IOException           if moving the task output fails
   * @throws IllegalStateException if no task was set up for the work path
   */
  public static boolean commitTask( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    // mirror needsTaskCommit: no work path means there is nothing to commit
    if( workpath == null )
      return false;

    Path taskOutputPath = new Path( workpath );

    FileSystem fs = getFSSafe( conf, taskOutputPath );

    if( fs == null )
      return false;

    // keyed by the raw property value, exactly as setupTask registered it
    if( decrementPathCount( workpath ) != 0 )
      return false;

    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    LOG.info( "committing task: '{}' - {}", taskId, taskOutputPath );

    if( writeDirectlyToWorkingPath( conf, taskOutputPath ) )
      return true;

    if( fs.exists( taskOutputPath ) )
      {
      // the work path is <job output>/_temporary/_<task id>, so the job output is two levels up
      Path jobOutputPath = taskOutputPath.getParent().getParent();
      // Move the task outputs to their final place
      moveTaskOutputs( conf, fs, jobOutputPath, taskOutputPath );

      // Delete the temporary task-specific output directory
      if( !fs.delete( taskOutputPath, true ) )
        LOG.info( "failed to delete the temporary output directory of task: '{}' - {}", taskId, taskOutputPath );

      LOG.info( "saved output of task '{}' to {}", taskId, jobOutputPath );
      }

    return true;
    }

  /**
   * Decrements the counter registered for the given work path while holding the same class lock
   * {@link #setupTask(Configuration)} uses to update the map.
   *
   * @param workpath the raw {@code mapred.work.output.dir} value
   * @return the counter value after the decrement
   * @throws IllegalStateException if no counter was registered for the work path
   */
  private static synchronized int decrementPathCount( String workpath )
    {
    AtomicInteger count = pathCounts.get( workpath );

    if( count == null )
      throw new IllegalStateException( "commitTask called without a prior setupTask for work path: " + workpath );

    return count.decrementAndGet();
    }

  /**
   * Called from flow step to remove temp dirs
   *
   * @param conf the configuration used to resolve the file system
   * @param tap  the tap whose identifier names the output path to clean
   * @throws IOException if the temporary directory cannot be inspected or deleted
   */
  public static void cleanupTapMetaData( Configuration conf, Tap tap ) throws IOException
    {
    cleanTempPath( conf, new Path( tap.getIdentifier() ) );
    }

  /**
   * Writes the success marker into the output path configured on the given configuration.
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the marker file cannot be created
   */
  public static void writeSuccessMarker( Configuration conf ) throws IOException
    {
    writeSuccessMarker( conf, FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ) );
    }

  /**
   * Creates an empty {@code _SUCCESS} file in the given output path, unless
   * {@code mapreduce.fileoutputcommitter.marksuccessfuljobs} is set to false.
   * <p>
   * Does nothing if the output path is null.
   *
   * @param conf       the configuration used to resolve the file system
   * @param outputPath the directory to place the marker in, may be null
   * @throws IOException if the marker file cannot be created
   */
  public static void writeSuccessMarker( Configuration conf, Path outputPath ) throws IOException
    {
    // no output path configured, consistent with setupJob and cleanupJob which also skip this case
    if( outputPath == null )
      return;

    if( !conf.getBoolean( "mapreduce.fileoutputcommitter.marksuccessfuljobs", true ) )
      return;

    LOG.info( "writing success marker to {}", outputPath );

    Path markerPath = new Path( outputPath, "_SUCCESS" );
    FileSystem fs = markerPath.getFileSystem( conf );

    fs.create( markerPath ).close();
    }

  /**
   * Removes the temporary directory under the configured output path; returns immediately when
   * {@link HadoopUtil#isInflow(Configuration)} reports true.
   * <p>
   * May only be called once. should only be called if not in a flow
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the temporary directory cannot be inspected or deleted
   */
  public static void cleanupJob( Configuration conf ) throws IOException
    {
    if( HadoopUtil.isInflow( conf ) )
      return;

    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    cleanTempPath( conf, outputPath );
    }

  /**
   * Recursively deletes the {@code _temporary} directory under the given output path, if present.
   * <p>
   * Does nothing if the output path is null, does not exist, or its file system cannot be resolved.
   *
   * @param conf       the configuration used to resolve the file system
   * @param outputPath the job output path, may be null
   * @throws IOException if the file system operations fail
   */
  private static synchronized void cleanTempPath( Configuration conf, Path outputPath ) throws IOException
    {
    // do the clean up of temporary directory

    if( outputPath == null )
      return;

    FileSystem fileSys = getFSSafe( conf, outputPath );

    if( fileSys == null )
      return;

    if( !fileSys.exists( outputPath ) )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );

    LOG.info( "deleting temp path {}", tmpDir );

    if( fileSys.exists( tmpDir ) )
      fileSys.delete( tmpDir, true );
    }

  /**
   * Resolves the file system of the given path without propagating failures.
   *
   * @param conf   the configuration used to resolve the file system
   * @param tmpDir the path whose file system is wanted
   * @return the file system, or null if it could not be resolved
   */
  private static FileSystem getFSSafe( Configuration conf, Path tmpDir )
    {
    try
      {
      return tmpDir.getFileSystem( conf );
      }
    catch( IOException exception )
      {
      // callers treat null as "nothing to do"; keep the cause visible for diagnosis
      LOG.debug( "unable to resolve file system for path: {}", tmpDir, exception );
      }

    return null;
    }

  /**
   * Builds the task specific work path {@code <output path>/_temporary/_<task id>}, qualified
   * against its file system when that file system can be resolved.
   *
   * @param conf the configuration holding the output path and task id
   * @return the task output path
   */
  private static Path getTaskOutputPath( Configuration conf )
    {
    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    Path p = new Path( FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId );

    try
      {
      FileSystem fs = p.getFileSystem( conf );
      return p.makeQualified( fs );
      }
    catch( IOException ie )
      {
      // fall back to the unqualified path
      return p;
      }
    }

  /**
   * Resolves the given directory against the job working directory and stores the result in
   * {@code mapred.work.output.dir}.
   *
   * @param conf      the configuration to update
   * @param outputDir the work output directory
   */
  static void setWorkOutputPath( Configuration conf, Path outputDir )
    {
    outputDir = new Path( asJobConfInstance( conf ).getWorkingDirectory(), outputDir );
    conf.set( "mapred.work.output.dir", outputDir.toString() );
    }

  /**
   * Creates the {@code _temporary} directory under the configured output path, unless no output
   * path is set or direct output is enabled for the file system. A failed {@code mkdirs} is logged, not thrown.
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the file system cannot be resolved or queried
   */
  public static void makeTempPath( Configuration conf ) throws IOException
    {
    // create job specific temporary directory in output path
    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    if( outputPath == null )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
    FileSystem fileSys = tmpDir.getFileSystem( conf );

    // do not create the temp dir if write direct
    if( isOutputWriteDirect( conf, fileSys ) )
      return;

    if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
      LOG.error( "mkdirs failed to create {}", tmpDir );
    }

  /**
   * Recursively moves a task output file or directory to its final location under the job output directory.
   * <p>
   * For a file, a failed rename is retried once after deleting whatever already occupies the final path.
   *
   * @param conf         the configuration holding the task id and output path
   * @param fs           the file system holding the task output
   * @param jobOutputDir the final job output directory
   * @param taskOutput   the file or directory to move
   * @throws IOException if an earlier output cannot be deleted or the output cannot be renamed
   */
  private static void moveTaskOutputs( Configuration conf, FileSystem fs, Path jobOutputDir, Path taskOutput ) throws IOException
    {
    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    if( fs.isFile( taskOutput ) )
      {
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      if( !fs.rename( taskOutput, finalOutputPath ) )
        {
        if( !fs.delete( finalOutputPath, true ) )
          throw new IOException( "Failed to delete earlier output of task: " + taskId );

        if( !fs.rename( taskOutput, finalOutputPath ) )
          throw new IOException( "Failed to save output of task: " + taskId );
        }

      LOG.debug( "Moved {} to {}", taskOutput, finalOutputPath );
      }
    else if( fs.getFileStatus( taskOutput ).isDir() )
      {
      FileStatus[] paths = fs.listStatus( taskOutput );
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      fs.mkdirs( finalOutputPath );

      if( paths != null )
        {
        for( FileStatus path : paths )
          moveTaskOutputs( conf, fs, jobOutputDir, path.getPath() );
        }
      }
    }

  /**
   * Maps a path below the task output path onto the same relative location below the job output directory.
   *
   * @param jobOutputDir   the final job output directory
   * @param taskOutput     the file or directory being moved
   * @param taskOutputPath the root of the task output
   * @return the final path, or {@code jobOutputDir} itself when {@code taskOutput} is the task output root
   * @throws IOException if {@code taskOutput} is not located below {@code taskOutputPath}
   */
  private static Path getFinalPath( Path jobOutputDir, Path taskOutput, Path taskOutputPath ) throws IOException
    {
    URI taskOutputUri = taskOutput.toUri();
    URI relativePath = taskOutputPath.toUri().relativize( taskOutputUri );

    // identity comparison is intentional: URI.relativize hands back the given URI when it cannot be relativized
    if( taskOutputUri == relativePath )
      {//taskOutputPath is not a parent of taskOutput
      throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput );
      }

    if( relativePath.getPath().length() > 0 )
      return new Path( jobOutputDir, relativePath.getPath() );

    return jobOutputDir;
    }

  /**
   * used in AWS EMR to disable temp paths on some file systems, s3.
   *
   * @param conf the configuration holding the direct output flags
   * @param path the path whose file system is checked
   * @return true if direct output is enabled for the file system of the given path, false if it is not or the
   * file system cannot be resolved
   */
  private static boolean writeDirectlyToWorkingPath( Configuration conf, Path path )
    {
    FileSystem fs = getFSSafe( conf, path );

    if( fs == null )
      return false;

    boolean result = isOutputWriteDirect( conf, fs );

    if( result )
      LOG.info( "output direct is enabled for this fs: {}", fs.getUri() );

    return result;
    }

  /**
   * Reads the boolean property {@code mapred.output.direct.<simple class name of fs>}, defaulting to false.
   *
   * @param conf the configuration to read
   * @param fs   the file system whose class name selects the property
   * @return the property value
   */
  protected static boolean isOutputWriteDirect( Configuration conf, FileSystem fs )
    {
    return conf.getBoolean( "mapred.output.direct." + fs.getClass().getSimpleName(), false );
    }
  }