/*
 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
 *
 * Project and contact information: https://cascading.wensel.net/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.util;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;

/**
 * Static helpers that manage the {@code _temporary} work directory of a file based output path:
 * creating it, pointing {@code mapred.work.output.dir} at a task specific sub-directory, moving
 * task output into the final output path on commit, and removing the temporary directory.
 * <p>
 * A per work path counter is kept in this class so that a work path registered more than once via
 * {@link #setupTask(Configuration)} is only committed by the last matching call to
 * {@link #commitTask(Configuration)}.
 */
public class Hadoop18TapUtil
  {
  /** The Hadoop temporary path used to prevent collisions */
  public static final String TEMPORARY_PATH = "_temporary";
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( Hadoop18TapUtil.class );
  /** Number of outstanding setupTask registrations per work path string; guarded by the class monitor. */
  private static final Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>();

  /**
   * Prepares the given configuration for writing to its configured output path.
   * <p>
   * Does nothing if no output path is configured or its file system cannot be resolved. Otherwise
   * a task id is stored in the configuration if none is present, the temporary directory is created
   * (unless direct output is enabled), and {@code mapred.work.output.dir} is set to either the
   * output path itself (direct output) or the task specific temporary path.
   * <p>
   * should only be called if not in a Flow
   *
   * @param conf the configuration to read from and update
   * @throws IOException if creating the temporary directory fails
   */
  public static void setupJob( Configuration conf ) throws IOException
    {
    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    if( outputPath == null )
      return;

    if( getFSSafe( conf, outputPath ) == null )
      return;

    String taskID = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    if( taskID == null ) // need to stuff a fake id
      {
      String mapper = conf.getBoolean( "mapred.task.is.map", conf.getBoolean( "mapreduce.task.is.map", true ) ) ? "m" : "r";

      // NOTE(review): the narrowing cast to int saturates at Integer.MAX_VALUE for current epoch
      // millisecond values, so this id does not actually vary with time. Left unchanged because the
      // work path (and therefore the setupTask/commitTask counter key) is derived from this id and
      // callers may rely on it being stable across configurations -- confirm before changing.
      String value = String.format( "attempt_%012d_0000_%s_000000_0", (int) Math.rint( System.currentTimeMillis() ), mapper );
      conf.set( "mapred.task.id", value );
      conf.set( "mapreduce.task.id", value );
      }

    makeTempPath( conf );

    if( writeDirectlyToWorkingPath( conf, outputPath ) )
      {
      LOG.info( "writing directly to output path: {}", outputPath );
      setWorkOutputPath( conf, outputPath );
      return;
      }

    // "mapred.work.output.dir"
    Path taskOutputPath = getTaskOutputPath( conf );
    setWorkOutputPath( conf, taskOutputPath );
    }

  /**
   * Registers one more user of the work path stored under {@code mapred.work.output.dir}.
   * <p>
   * Does nothing if the work path is not set or its file system cannot be resolved.
   *
   * @param conf the configuration holding the work path and task id
   * @throws IOException declared for callers; not thrown by the current implementation
   */
  public static synchronized void setupTask( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return;

    FileSystem fs = getFSSafe( conf, new Path( workpath ) );

    if( fs == null )
      return;

    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    LOG.info( "setting up task: '{}' - {}", taskId, workpath );

    AtomicInteger integer = pathCounts.get( workpath );

    if( integer == null )
      {
      integer = new AtomicInteger();
      pathCounts.put( workpath, integer );
      }

    integer.incrementAndGet();
    }

  /**
   * Reports whether the work path stored under {@code mapred.work.output.dir} exists.
   *
   * @param conf the configuration holding the work path
   * @return true if the work path is set, its file system resolves, and the path exists
   * @throws IOException if the existence check fails
   */
  public static boolean needsTaskCommit( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return false;

    Path taskOutputPath = new Path( workpath );
    FileSystem fs = getFSSafe( conf, taskOutputPath );

    if( fs == null )
      return false;

    return fs.exists( taskOutputPath );
    }

  /**
   * Moves all files from the task output path to the job output path once the last registered
   * user of the work path commits.
   * <p>
   * The job output path is taken to be the grandparent of the work path. When direct output is
   * enabled for the file system nothing is moved.
   *
   * @param conf the configuration holding the work path and task id
   * @return false if the work path is not set, its file system cannot be resolved, or other
   * registrations made via {@link #setupTask(Configuration)} are still outstanding; true otherwise
   * @throws IOException           if moving the task output fails
   * @throws IllegalStateException if {@link #setupTask(Configuration)} was never called for the work path
   */
  public static boolean commitTask( Configuration conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    // nothing was set up for this configuration; mirrors the guard in needsTaskCommit
    if( workpath == null )
      return false;

    Path taskOutputPath = new Path( workpath );

    FileSystem fs = getFSSafe( conf, taskOutputPath );

    if( fs == null )
      return false;

    // only the last user of a shared work path performs the commit
    if( !releaseWorkPath( workpath ) )
      return false;

    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    LOG.info( "committing task: '{}' - {}", taskId, taskOutputPath );

    if( writeDirectlyToWorkingPath( conf, taskOutputPath ) )
      return true;

    if( fs.exists( taskOutputPath ) )
      {
      Path jobOutputPath = taskOutputPath.getParent().getParent();
      // Move the task outputs to their final place
      moveTaskOutputs( conf, fs, jobOutputPath, taskOutputPath );

      // Delete the temporary task-specific output directory
      if( !fs.delete( taskOutputPath, true ) )
        LOG.info( "failed to delete the temporary output directory of task: '{}' - {}", taskId, taskOutputPath );

      LOG.info( "saved output of task '{}' to {}", taskId, jobOutputPath );
      }

    return true;
    }

  /**
   * Decrements the registration count of the given work path under the same lock used by
   * {@link #setupTask(Configuration)}, using the same raw string key.
   *
   * @param workpath the value of {@code mapred.work.output.dir}
   * @return true if the count reached zero
   * @throws IllegalStateException if no registration exists for the work path
   */
  private static synchronized boolean releaseWorkPath( String workpath )
    {
    AtomicInteger count = pathCounts.get( workpath );

    if( count == null )
      throw new IllegalStateException( "setupTask was not called for work path: " + workpath );

    return count.decrementAndGet() == 0;
    }

  /**
   * Called from flow step to remove temp dirs
   *
   * @param conf the configuration used to resolve the file system
   * @param tap  the tap whose identifier names the output path to clean
   * @throws IOException if the temporary directory cannot be checked or deleted
   */
  public static void cleanupTapMetaData( Configuration conf, Tap tap ) throws IOException
    {
    cleanTempPath( conf, new Path( tap.getIdentifier() ) );
    }

  /**
   * Writes the success marker into the output path configured on the given configuration.
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the marker file cannot be created
   */
  public static void writeSuccessMarker( Configuration conf ) throws IOException
    {
    writeSuccessMarker( conf, FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ) );
    }

  /**
   * Creates an empty {@code _SUCCESS} file in the given output path, unless
   * {@code mapreduce.fileoutputcommitter.marksuccessfuljobs} is set to false.
   *
   * @param conf       the configuration used to read the flag and resolve the file system
   * @param outputPath the directory to write the marker into; nothing is written if null
   * @throws IOException if the marker file cannot be created
   */
  public static void writeSuccessMarker( Configuration conf, Path outputPath ) throws IOException
    {
    // no output path configured, there is nowhere to place the marker
    if( outputPath == null )
      return;

    if( !conf.getBoolean( "mapreduce.fileoutputcommitter.marksuccessfuljobs", true ) )
      return;

    LOG.info( "writing success marker to {}", outputPath );

    Path markerPath = new Path( outputPath, "_SUCCESS" );
    FileSystem fs = markerPath.getFileSystem( conf );

    fs.create( markerPath ).close();
    }

  /**
   * Removes the temporary directory below the configured output path, unless the configuration
   * is reported as being in a flow.
   * <p>
   * May only be called once. should only be called if not in a flow
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the temporary directory cannot be checked or deleted
   */
  public static void cleanupJob( Configuration conf ) throws IOException
    {
    if( HadoopUtil.isInflow( conf ) )
      return;

    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    cleanTempPath( conf, outputPath );
    }

  /**
   * Recursively deletes the {@code _temporary} directory below the given output path, if the
   * output path and the temporary directory both exist.
   *
   * @param conf       the configuration used to resolve the file system
   * @param outputPath the output path, may be null
   * @throws IOException if an existence check or the delete fails
   */
  private static synchronized void cleanTempPath( Configuration conf, Path outputPath ) throws IOException
    {
    // do the clean up of temporary directory

    if( outputPath == null )
      return;

    FileSystem fileSys = getFSSafe( conf, outputPath );

    if( fileSys == null )
      return;

    if( !fileSys.exists( outputPath ) )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );

    LOG.info( "deleting temp path {}", tmpDir );

    if( fileSys.exists( tmpDir ) )
      fileSys.delete( tmpDir, true );
    }

  /**
   * Resolves the file system of the given path without propagating failures.
   *
   * @param conf   the configuration used to resolve the file system
   * @param tmpDir the path to resolve
   * @return the file system, or null if resolving it threw an IOException
   */
  private static FileSystem getFSSafe( Configuration conf, Path tmpDir )
    {
    try
      {
      return tmpDir.getFileSystem( conf );
      }
    catch( IOException exception )
      {
      // callers treat a missing file system as "nothing to do"; keep a trace for diagnosis
      LOG.debug( "unable to resolve file system for path: {}", tmpDir, exception );
      }

    return null;
    }

  /**
   * Builds the task specific path {@code <output>/_temporary/_<taskId>} and qualifies it against
   * its file system when that file system can be resolved.
   *
   * @param conf the configuration holding the output path and task id
   * @return the qualified task output path, or the unqualified path if the file system is unavailable
   */
  private static Path getTaskOutputPath( Configuration conf )
    {
    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    Path p = new Path( FileOutputFormat.getOutputPath( asJobConfInstance( conf ) ), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId );

    try
      {
      FileSystem fs = p.getFileSystem( conf );
      return p.makeQualified( fs );
      }
    catch( IOException ie )
      {
      return p;
      }
    }

  /**
   * Stores the given directory, resolved against the job working directory, under
   * {@code mapred.work.output.dir}.
   *
   * @param conf      the configuration to update
   * @param outputDir the work output directory
   */
  static void setWorkOutputPath( Configuration conf, Path outputDir )
    {
    outputDir = new Path( asJobConfInstance( conf ).getWorkingDirectory(), outputDir );
    conf.set( "mapred.work.output.dir", outputDir.toString() );
    }

  /**
   * Creates the {@code _temporary} directory below the configured output path, unless no output
   * path is configured or direct output is enabled for the file system. A failed mkdirs is only logged.
   *
   * @param conf the configuration holding the output path
   * @throws IOException if the file system cannot be resolved or accessed
   */
  public static void makeTempPath( Configuration conf ) throws IOException
    {
    // create job specific temporary directory in output path
    Path outputPath = FileOutputFormat.getOutputPath( asJobConfInstance( conf ) );

    if( outputPath == null )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
    FileSystem fileSys = tmpDir.getFileSystem( conf );

    // do not create the temp dir if write direct
    if( isOutputWriteDirect( conf, fileSys ) )
      return;

    if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
      LOG.error( "mkdirs failed to create {}", tmpDir );
    }

  /**
   * Recursively moves the given task output (file or directory) to the matching location below
   * the job output directory.
   * <p>
   * For a file, a failed rename is retried once after deleting the existing target.
   *
   * @param conf         the configuration holding the task id and output path
   * @param fs           the file system both paths live on
   * @param jobOutputDir the final output directory
   * @param taskOutput   the file or directory to move
   * @throws IOException if an earlier output cannot be deleted or the rename fails twice
   */
  private static void moveTaskOutputs( Configuration conf, FileSystem fs, Path jobOutputDir, Path taskOutput ) throws IOException
    {
    String taskId = conf.get( "mapred.task.id", conf.get( "mapreduce.task.id" ) );

    if( fs.isFile( taskOutput ) )
      {
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      if( !fs.rename( taskOutput, finalOutputPath ) )
        {
        // the target may be left over from an earlier attempt: remove it and retry once
        if( !fs.delete( finalOutputPath, true ) )
          throw new IOException( "Failed to delete earlier output of task: " + taskId );

        if( !fs.rename( taskOutput, finalOutputPath ) )
          throw new IOException( "Failed to save output of task: " + taskId );
        }

      LOG.debug( "Moved {} to {}", taskOutput, finalOutputPath );
      }
    else if( fs.getFileStatus( taskOutput ).isDir() )
      {
      FileStatus[] paths = fs.listStatus( taskOutput );
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      fs.mkdirs( finalOutputPath );

      if( paths != null )
        {
        for( FileStatus path : paths )
          moveTaskOutputs( conf, fs, jobOutputDir, path.getPath() );
        }
      }
    }

  /**
   * Maps a path below the task output path onto the same relative location below the job output directory.
   *
   * @param jobOutputDir   the final output directory
   * @param taskOutput     the path to map
   * @param taskOutputPath the task specific base path {@code taskOutput} is expected to live under
   * @return the target path, or {@code jobOutputDir} itself when the relative path is empty
   * @throws IOException if {@code taskOutputPath} is not a parent of {@code taskOutput}
   */
  private static Path getFinalPath( Path jobOutputDir, Path taskOutput, Path taskOutputPath ) throws IOException
    {
    URI taskOutputUri = taskOutput.toUri();
    URI relativePath = taskOutputPath.toUri().relativize( taskOutputUri );

    // identity comparison is intended: relativize hands back its argument when it cannot relativize
    if( taskOutputUri == relativePath )
      {//taskOutputPath is not a parent of taskOutput
      throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput );
      }

    if( relativePath.getPath().length() > 0 )
      return new Path( jobOutputDir, relativePath.getPath() );

    return jobOutputDir;
    }

  /**
   * used in AWS EMR to disable temp paths on some file systems, s3.
   *
   * @param conf the configuration holding the direct output flags
   * @param path the path whose file system is checked
   * @return true if the file system resolves and direct output is enabled for it
   */
  private static boolean writeDirectlyToWorkingPath( Configuration conf, Path path )
    {
    FileSystem fs = getFSSafe( conf, path );

    if( fs == null )
      return false;

    boolean result = isOutputWriteDirect( conf, fs );

    if( result )
      LOG.info( "output direct is enabled for this fs: {}", fs.getName() );

    return result;
    }

  /**
   * Reads the boolean property {@code mapred.output.direct.<SimpleClassName>} for the class of the
   * given file system, defaulting to false.
   *
   * @param conf the configuration to read
   * @param fs   the file system whose simple class name selects the property
   * @return the configured flag value
   */
  protected static boolean isOutputWriteDirect( Configuration conf, FileSystem fs )
    {
    return conf.getBoolean( "mapred.output.direct." + fs.getClass().getSimpleName(), false );
    }
  }