/*
 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
 *
 * Project and contact information: https://cascading.wensel.net/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.Manifest;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.Scope;
import cascading.pipe.Group;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.util.LogUtil;
import cascading.util.Util;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.util.Util.invokeInstanceMethod;

/**
 * Static helpers for creating, copying, and interrogating Hadoop {@link Configuration} and
 * {@link JobConf} instances, for serializing objects into configurations as Base64 strings,
 * and for synchronizing local classpath resources with a remote filesystem.
 */
public class HadoopUtil
  {
  /** Configuration property set to {@code true} while a Cascading flow is executing. */
  public static final String CASCADING_FLOW_EXECUTING = "cascading.flow.executing";

  private static final Logger LOG = LoggerFactory.getLogger( HadoopUtil.class );

  /** Base64 output is pure ASCII, so a fixed charset is safe; using a Charset constant avoids the impossible UnsupportedEncodingException path. */
  private static final Charset ENCODING = StandardCharsets.US_ASCII;
  private static final Class<?> DEFAULT_OBJECT_SERIALIZER = JavaObjectSerializer.class;

  private static PlatformInfo platformInfo;

  /** Marks the given configuration as executing inside a flow. */
  public static void setIsInflow( Configuration conf )
    {
    conf.setBoolean( CASCADING_FLOW_EXECUTING, true );
    }

  /** Returns true if the given configuration was marked via {@link #setIsInflow(Configuration)}. */
  public static boolean isInflow( Configuration conf )
    {
    return conf.getBoolean( CASCADING_FLOW_EXECUTING, false );
    }

  public static void initLog4j( JobConf configuration )
    {
    initLog4j( (Configuration) configuration );
    }

  /**
   * Applies logger levels declared in the comma separated {@code log4j.logger} property
   * (each element shaped {@code name=LEVEL}), if log4j is present on the classpath.
   */
  public static void initLog4j( Configuration configuration )
    {
    String values = configuration.get( "log4j.logger", null );

    if( values == null || values.length() == 0 )
      return;

    if( !Util.hasClass( "org.apache.log4j.Logger" ) )
      {
      LOG.info( "org.apache.log4j.Logger is not in the current CLASSPATH, not setting log4j.logger properties" );
      return;
      }

    String[] elements = values.split( "," );

    for( String element : elements )
      LogUtil.setLog4jLevel( element.split( "=" ) );
    }

  // only place JobConf should ever be returned
  public static JobConf asJobConfInstance( Configuration configuration )
    {
    if( configuration instanceof JobConf )
      return (JobConf) configuration;

    return new JobConf( configuration );
    }

  public static <C> C copyJobConf( C parentJobConf )
    {
    return copyConfiguration( parentJobConf );
    }

  /**
   * Returns a deep copy of the given JobConf that does NOT share Credentials with the parent.
   *
   * @param parentJobConf the JobConf to copy, may not be null
   * @return a new JobConf with an independent Credentials instance
   */
  public static JobConf copyJobConf( JobConf parentJobConf )
    {
    if( parentJobConf == null )
      throw new IllegalArgumentException( "parent may not be null" );

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    final Configuration configurationCopy = new Configuration( parentJobConf );
    final JobConf jobConf = new JobConf( configurationCopy );

    jobConf.getCredentials().addAll( parentJobConf.getCredentials() );

    return jobConf;
    }

  public static JobConf createJobConf( Map<Object, Object> properties )
    {
    return createJobConf( properties, null );
    }

  /**
   * Creates a new JobConf seeded from the (optional) default JobConf, then overlaid with
   * the given properties. Class and JobConf valued properties are silently skipped.
   */
  public static JobConf createJobConf( Map<Object, Object> properties, JobConf defaultJobconf )
    {
    JobConf jobConf = defaultJobconf == null ? new JobConf() : copyJobConf( defaultJobconf );

    if( properties == null )
      return jobConf;

    return copyConfiguration( properties, jobConf );
    }

  /**
   * Returns a copy of the given Configuration (or subclass), preserving the concrete runtime
   * type via its copy constructor, without sharing Credentials with the parent.
   *
   * @param parent a Configuration instance, may not be null
   * @throws IllegalArgumentException if parent is null or not a Configuration
   */
  public static <C> C copyConfiguration( C parent )
    {
    if( parent == null )
      throw new IllegalArgumentException( "parent may not be null" );

    if( !( parent instanceof Configuration ) )
      throw new IllegalArgumentException( "parent must be of type Configuration" );

    Configuration conf = (Configuration) parent;

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    Configuration configurationCopy = new Configuration( conf );

    Configuration copiedConf = callCopyConstructor( parent.getClass(), configurationCopy );

    // getCredentials is reflected since it may not exist on all supported Hadoop versions
    if( Util.hasInstanceMethod( parent, "getCredentials", null ) )
      {
      Object result = invokeInstanceMethod( parent, "getCredentials", null, null );
      Object credentials = invokeInstanceMethod( copiedConf, "getCredentials", null, null );

      invokeInstanceMethod( credentials, "addAll", new Object[]{result}, new Class[]{credentials.getClass()} );
      }

    return (C) copiedConf;
    }

  /**
   * Invokes {@code type}'s copy constructor taking {@code parent}'s runtime type.
   *
   * @throws CascadingException wrapping any reflective failure, with the cause preserved
   */
  protected static <C extends Configuration> C callCopyConstructor( Class type, Configuration parent )
    {
    try
      {
      Constructor<C> constructor = type.getConstructor( parent.getClass() );

      return constructor.newInstance( parent );
      }
    catch( NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException exception )
      {
      // chain the cause so reflective failures remain diagnosable
      throw new CascadingException( "unable to create copy of: " + type, exception );
      }
    }

  /**
   * Copies all String-representable entries of srcProperties into dstConfiguration.
   * Null values, Class values, and JobConf values are skipped.
   */
  public static <C extends Configuration> C copyConfiguration( Map<Object, Object> srcProperties, C dstConfiguration )
    {
    Set<Object> keys = new HashSet<Object>( srcProperties.keySet() );

    // keys will only be grabbed if both key/value are String, so keep orig keys
    if( srcProperties instanceof Properties )
      keys.addAll( ( (Properties) srcProperties ).stringPropertyNames() );

    for( Object key : keys )
      {
      Object value = srcProperties.get( key );

      if( value == null && srcProperties instanceof Properties && key instanceof String )
        value = ( (Properties) srcProperties ).getProperty( (String) key );

      if( value == null ) // don't stuff null values
        continue;

      // don't let these objects pass, even though toString is called below.
      if( value instanceof Class || value instanceof JobConf )
        continue;

      dstConfiguration.set( key.toString(), value.toString() );
      }

    return dstConfiguration;
    }

  /** Returns all entries of the given configuration as a mutable Map; empty if jobConf is null. */
  public static Map<Object, Object> createProperties( Configuration jobConf )
    {
    Map<Object, Object> properties = new HashMap<Object, Object>();

    if( jobConf == null )
      return properties;

    for( Map.Entry<String, String> entry : jobConf )
      properties.put( entry.getKey(), entry.getValue() );

    return properties;
    }

  /**
   * Detaches and returns the HDFS client shutdown hook (the {@code clientFinalizer} thread)
   * so callers can run it at a time of their choosing, or null if it could not be found.
   */
  public static Thread getHDFSShutdownHook()
    {
    Exception caughtException;

    try
      {
      // we must init the FS so the finalizer is registered
      FileSystem.getLocal( new JobConf() );

      Field field = FileSystem.class.getDeclaredField( "clientFinalizer" );
      field.setAccessible( true );

      Thread finalizer = (Thread) field.get( null );

      if( finalizer != null )
        Runtime.getRuntime().removeShutdownHook( finalizer );

      return finalizer;
      }
    catch( NoSuchFieldException | IllegalAccessException | IOException exception )
      {
      caughtException = exception;
      }

    LOG.debug( "unable to find and remove client hdfs shutdown hook, received exception: {}", caughtException.getClass().getName() );

    return null;
    }

  /** Base64 encodes the given bytes into an ASCII String. */
  public static String encodeBytes( byte[] bytes )
    {
    return new String( Base64.encodeBase64( bytes ), ENCODING );
    }

  /** Decodes a Base64 String produced by {@link #encodeBytes(byte[])} back into bytes. */
  public static byte[] decodeBytes( String string )
    {
    return Base64.decodeBase64( string.getBytes( ENCODING ) );
    }

  /**
   * Instantiates the ObjectSerializer named by {@link ObjectSerializer#OBJECT_SERIALIZER_PROPERTY},
   * or the default serializer if unset, and verifies it accepts the given type.
   *
   * @throws ClassNotFoundException   if the configured serializer class cannot be loaded
   * @throws IllegalArgumentException if instantiation fails or the serializer rejects the type
   */
  public static <T> ObjectSerializer instantiateSerializer( Configuration conf, Class<T> type ) throws ClassNotFoundException
    {
    Class<ObjectSerializer> flowSerializerClass;

    String serializerClassName = conf.get( ObjectSerializer.OBJECT_SERIALIZER_PROPERTY );

    if( serializerClassName == null || serializerClassName.length() == 0 )
      flowSerializerClass = (Class<ObjectSerializer>) DEFAULT_OBJECT_SERIALIZER;
    else
      flowSerializerClass = (Class<ObjectSerializer>) Class.forName( serializerClassName );

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = flowSerializerClass.newInstance();

      if( objectSerializer instanceof Configurable )
        ( (Configurable) objectSerializer ).setConf( conf );
      }
    catch( Exception exception )
      {
      // chain the cause instead of printing the stack trace to stderr
      throw new IllegalArgumentException( "Unable to instantiate serializer \""
        + flowSerializerClass.getName()
        + "\" for class: "
        + type.getName(), exception );
      }

    if( !objectSerializer.accepts( type ) )
      throw new IllegalArgumentException( serializerClassName + " won't accept objects of class " + type.toString() );

    return objectSerializer;
    }

  public static <T> String serializeBase64( T object, Configuration conf ) throws IOException
    {
    return serializeBase64( object, conf, true );
    }

  /**
   * Serializes the given object with the configured ObjectSerializer and returns
   * the result Base64 encoded, optionally compressed.
   */
  public static <T> String serializeBase64( T object, Configuration conf, boolean compress ) throws IOException
    {
    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, object.getClass() );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return encodeBytes( objectSerializer.serialize( object, compress ) );
    }

  /**
   * This method deserializes the Base64 encoded String into an Object instance.
   *
   * @param string the Base64 encoded serialized form, may be null or empty
   * @return an Object, or null if string is null or empty
   */
  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type ) throws IOException
    {
    return deserializeBase64( string, conf, type, true );
    }

  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type, boolean decompress ) throws IOException
    {
    if( string == null || string.length() == 0 )
      return null;

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, type );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return objectSerializer.deserialize( decodeBytes( string ), type, decompress );
    }

  public static Class findMainClass( Class defaultType )
    {
    return Util.findMainClass( defaultType, "org.apache.hadoop" );
    }

  /**
   * Returns the entries in updatedConf that differ from defaultConf, minus the
   * environment specific working directory properties.
   */
  public static Map<String, String> getConfig( Configuration defaultConf, Configuration updatedConf )
    {
    Map<String, String> configs = new HashMap<String, String>();

    for( Map.Entry<String, String> entry : updatedConf )
      configs.put( entry.getKey(), entry.getValue() );

    for( Map.Entry<String, String> entry : defaultConf )
      {
      if( entry.getValue() == null )
        continue;

      String updatedValue = configs.get( entry.getKey() );

      // if the values are the same, lets purge from map to save space
      if( updatedValue != null && updatedValue.equals( entry.getValue() ) )
        configs.remove( entry.getKey() );
      }

    // the working directory is environment specific, never forward it
    configs.remove( "mapred.working.dir" );
    configs.remove( "mapreduce.job.working.dir" ); // hadoop2

    return configs;
    }

  public static JobConf[] getJobConfs( Configuration job, List<Map<String, String>> configs )
    {
    JobConf[] jobConfs = new JobConf[ configs.size() ];

    for( int i = 0; i < jobConfs.length; i++ )
      jobConfs[ i ] = (JobConf) mergeConf( job, configs.get( i ), false );

    return jobConfs;
    }

  /**
   * Merges the given key/value pairs into the job configuration, either mutating it
   * directly or, when {@code directly} is false, into a copy.
   */
  public static <J extends Configuration> J mergeConf( J job, Map<String, String> config, boolean directly )
    {
    Configuration currentConf = directly ? job : ( job instanceof JobConf ? copyJobConf( (JobConf) job ) : new Configuration( job ) );

    for( String key : config.keySet() )
      {
      if( LOG.isDebugEnabled() )
        LOG.debug( "merging key: {} value: {}", key, config.get( key ) );

      currentConf.set( key, config.get( key ) );
      }

    return (J) currentConf;
    }

  /** Returns a new JobConf containing all properties of jobConf except the given keys. */
  public static Configuration removePropertiesFrom( Configuration jobConf, String... keys )
    {
    Map<Object, Object> properties = createProperties( jobConf );

    for( String key : keys )
      properties.remove( key );

    return copyConfiguration( properties, new JobConf() );
    }

  public static boolean removeStateFromDistCache( Configuration conf, String path ) throws IOException
    {
    return new Hfs( new TextLine(), path ).deleteResource( conf );
    }

  public static PlatformInfo getPlatformInfo()
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( JobConf.class, "org/apache/hadoop", "Hadoop" );

    return platformInfo;
    }

  public static PlatformInfo getPlatformInfo( Class type, String attributePath, String platformName )
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( type, attributePath, platformName );

    return platformInfo;
    }

  public static PlatformInfo createPlatformInfo( Class type, String attributePath, String platformName )
    {
    return getPlatformInfoInternal( type, attributePath, platformName );
    }

  /**
   * Derives the platform vendor/version from the jar manifest containing the given type,
   * falling back to a version parsed from the jar path when no manifest data is available.
   */
  private static PlatformInfo getPlatformInfoInternal( Class type, String attributePath, String platformName )
    {
    URL url = type.getResource( type.getSimpleName() + ".class" );

    if( url == null || !url.toString().startsWith( "jar" ) )
      return new PlatformInfo( platformName, null, null );

    String path = url.toString();
    path = path.substring( 0, path.lastIndexOf( "!" ) + 1 );

    String manifestPath = path + "/META-INF/MANIFEST.MF";
    String parsedVersion = Util.findVersion( path.substring( 0, path.length() - 1 ) );

    Manifest manifest;

    try( InputStream stream = new URL( manifestPath ).openStream() ) // close the stream, openStream leaks otherwise
      {
      manifest = new Manifest( stream );
      }
    catch( IOException exception )
      {
      LOG.warn( "unable to get manifest from {}: {}", manifestPath, exception.getMessage() );

      return new PlatformInfo( platformName, null, parsedVersion );
      }

    Attributes attributes = manifest.getAttributes( attributePath );

    if( attributes == null )
      attributes = manifest.getMainAttributes();

    if( attributes == null )
      {
      LOG.debug( "unable to get platform manifest attributes" );
      return new PlatformInfo( platformName, null, parsedVersion );
      }

    String vendor = attributes.getValue( "Implementation-Vendor" );
    String version = attributes.getValue( "Implementation-Version" );

    if( Util.isEmpty( version ) )
      version = parsedVersion;

    return new PlatformInfo( platformName, vendor, version );
    }

  /**
   * Copies paths from one local path to a remote path. If syncTimes is true, both modification and access time are
   * changed to match the local 'from' path.
   * <p>
   * Returns a map of file-name to remote modification times if the remote time is different than the local time.
   *
   * @param config      current configuration, used to resolve the local and default filesystems
   * @param commonPaths local to remote path pairs, may be null
   * @param syncTimes   if true, push the local modification time onto the remote copy
   */
  public static Map<String, Long> syncPaths( Configuration config, Map<Path, Path> commonPaths, boolean syncTimes )
    {
    if( commonPaths == null )
      return Collections.emptyMap();

    Map<String, Long> timestampMap = new HashMap<>();

    Map<Path, Path> copyPaths = getCopyPaths( config, commonPaths ); // tests remote file existence or if stale

    LocalFileSystem localFS = getLocalFS( config );
    FileSystem remoteFS = getDefaultFS( config );

    for( Map.Entry<Path, Path> entry : copyPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        LOG.info( "copying from: {}, to: {}", localPath, remotePath );
        remoteFS.copyFromLocalFile( localPath, remotePath );

        if( !syncTimes )
          {
          timestampMap.put( remotePath.getName(), remoteFS.getFileStatus( remotePath ).getModificationTime() );
          continue;
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to copy local: " + localPath + " to remote: " + remotePath, exception );
        }

      FileStatus localFileStatus = null;

      try
        {
        // sync the modified times so we can lazily upload jars to hdfs after job is started
        // otherwise modified time will be local to hdfs
        localFileStatus = localFS.getFileStatus( localPath );
        remoteFS.setTimes( remotePath, localFileStatus.getModificationTime(), -1 ); // don't set the access time
        }
      catch( IOException exception )
        {
        LOG.info( "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.", remotePath );

        if( localFileStatus != null )
          timestampMap.put( remotePath.getName(), localFileStatus.getModificationTime() );
        }
      }

    return timestampMap;
    }

  /** Returns local-path to remote-path pairs for every name present in both maps. */
  public static Map<Path, Path> getCommonPaths( Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    Map<Path, Path> commonPaths = new HashMap<Path, Path>();

    for( Map.Entry<String, Path> entry : localPaths.entrySet() )
      {
      if( remotePaths.containsKey( entry.getKey() ) )
        commonPaths.put( entry.getValue(), remotePaths.get( entry.getKey() ) );
      }

    return commonPaths;
    }

  /** Returns the subset of commonPaths whose remote side is missing or older than the local side. */
  private static Map<Path, Path> getCopyPaths( Configuration config, Map<Path, Path> commonPaths )
    {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();

    FileSystem remoteFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    for( Map.Entry<Path, Path> entry : commonPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        boolean localExists = localFS.exists( localPath );
        boolean remoteExist = remoteFS.exists( remotePath );

        if( localExists && !remoteExist )
          {
          copyPaths.put( localPath, remotePath );
          }
        else if( localExists )
          {
          long localModTime = localFS.getFileStatus( localPath ).getModificationTime();
          long remoteModTime = remoteFS.getFileStatus( remotePath ).getModificationTime();

          if( localModTime > remoteModTime )
            copyPaths.put( localPath, remotePath );
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to get handle to underlying filesystem", exception );
        }
      }

    return copyPaths;
    }

  /**
   * Partitions the given classpath entries into local and remote path maps, qualifying each
   * entry against the appropriate filesystem and keying by (optionally sub-pathed) file name.
   *
   * @throws FlowException if any classpath entry does not exist
   */
  public static void resolvePaths( Configuration config, Collection<String> classpath, String remoteRoot, String resourceSubPath, Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    FileSystem defaultFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    Path remoteRootPath = new Path( remoteRoot == null ? "./.staging" : remoteRoot );

    if( resourceSubPath != null )
      remoteRootPath = new Path( remoteRootPath, resourceSubPath );

    remoteRootPath = defaultFS.makeQualified( remoteRootPath );

    boolean defaultIsLocal = defaultFS.equals( localFS );

    for( String stringPath : classpath )
      {
      Path path = new Path( stringPath );

      URI uri = path.toUri();

      if( uri.getScheme() == null && !defaultIsLocal ) // we want to sync
        {
        Path localPath = localFS.makeQualified( path );

        if( !exists( localFS, localPath ) )
          throw new FlowException( "path not found: " + localPath );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        remotePaths.put( name, defaultFS.makeQualified( new Path( remoteRootPath, path.getName() ) ) );
        }
      else if( localFS.equals( getFileSystem( config, path ) ) )
        {
        if( !exists( localFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path localPath = localFS.makeQualified( path );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        }
      else
        {
        if( !exists( defaultFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path defaultPath = defaultFS.makeQualified( path );

        String name = defaultPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        remotePaths.put( name, defaultPath );
        }
      }
    }

  private static boolean exists( FileSystem fileSystem, Path path )
    {
    try
      {
      return fileSystem.exists( path );
      }
    catch( IOException exception )
      {
      // chain the cause, consistent with getFileSystem below
      throw new FlowException( "could not test file exists: " + path, exception );
      }
    }

  private static FileSystem getFileSystem( Configuration config, Path path )
    {
    try
      {
      return path.getFileSystem( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  public static LocalFileSystem getLocalFS( Configuration config )
    {
    try
      {
      return FileSystem.getLocal( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  public static FileSystem getDefaultFS( Configuration config )
    {
    try
      {
      return FileSystem.get( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  /**
   * Returns true if the configuration denotes standalone/local mode, checking YARN, Tez,
   * then Hadoop 1.x properties in that order; false (with a warning) if none are set.
   */
  public static boolean isLocal( Configuration conf )
    {
    // hadoop 1.0 and 2.0 use different properties to define local mode: we check the new YARN
    // property first
    String frameworkName = conf.get( "mapreduce.framework.name" );

    // we are running on hadoop 2.0 (YARN)
    if( frameworkName != null )
      return frameworkName.equals( "local" );

    // for Tez
    String tezLocal = conf.get( "tez.local.mode" );

    if( tezLocal != null )
      return tezLocal.equals( "true" );

    // hadoop 1.0: use the old property to determine the local mode
    String hadoop1 = conf.get( "mapred.job.tracker" );

    if( hadoop1 == null )
      {
      LOG.warn( "could not successfully test if Hadoop based platform is in standalone/local mode, no valid properties set, returning false - tests for: mapreduce.framework.name, tez.local.mode, and mapred.job.tracker" );
      return false;
      }

    return hadoop1.equals( "local" );
    }

  public static boolean isYARN( Configuration conf )
    {
    return conf.get( "mapreduce.framework.name" ) != null;
    }

  /** Forces local/standalone mode for Hadoop 1.x, YARN, and Tez in one call. */
  public static void setLocal( Configuration conf )
    {
    // set both properties to local
    conf.set( "mapred.job.tracker", "local" );

    // yarn
    conf.set( "mapreduce.framework.name", "local" );

    // tez
    conf.set( "tez.local.mode", "true" );
    conf.set( "tez.runtime.optimize.local.fetch", "true" );
    }

  /** Returns true if the named class is loadable and assignable to the given interface. */
  private static boolean interfaceAssignableFromClassName( Class<?> xface, String className )
    {
    if( ( className == null ) || ( xface == null ) )
      return false;

    try
      {
      // Class.forName never returns null, so no null check is needed on the result
      return xface.isAssignableFrom( Class.forName( className ) );
      }
    catch( ClassNotFoundException cnfe )
      {
      return false; // let downstream figure it out
      }
    }

  /**
   * Sets {@code mapred.mapper.new-api} according to whether the given input format class
   * belongs to the stable (mapred) or new (mapreduce) API.
   *
   * @return false if className is null, true otherwise
   * @throws IllegalStateException if the API generation cannot be determined
   */
  public static boolean setNewApi( Configuration conf, String className )
    {
    if( className == null ) // silently return and let the error be caught downstream
      return false;

    boolean isStable = className.startsWith( "org.apache.hadoop.mapred." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapred.InputFormat.class, className );

    boolean isNew = className.startsWith( "org.apache.hadoop.mapreduce." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapreduce.InputFormat.class, className );

    if( isStable )
      conf.setBoolean( "mapred.mapper.new-api", false );
    else if( isNew )
      conf.setBoolean( "mapred.mapper.new-api", true );
    else
      throw new IllegalStateException( "cannot determine if class denotes stable or new api, please set 'mapred.mapper.new-api' to the appropriate value" );

    return true;
    }

  /** Appends the given paths, resolved against the working directory, to {@code mapred.input.dir}. */
  public static void addInputPaths( Configuration conf, Iterable<Path> paths )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    String dirs = conf.get( "mapred.input.dir" );
    StringBuilder buffer = new StringBuilder( dirs == null ? "" : dirs );

    for( Path path : paths )
      {
      if( !path.isAbsolute() )
        path = new Path( workingDirectory, path );

      String dirStr = StringUtils.escapeString( path.toString() );

      if( buffer.length() != 0 )
        buffer.append( ',' );

      buffer.append( dirStr );
      }

    conf.set( "mapred.input.dir", buffer.toString() );
    }

  public static void addInputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    String dirStr = StringUtils.escapeString( path.toString() );
    String dirs = conf.get( "mapred.input.dir" );
    conf.set( "mapred.input.dir", dirs == null ? dirStr :
      dirs + StringUtils.COMMA_STR + dirStr );
    }

  public static void setOutputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    conf.set( "mapred.output.dir", path.toString() );
    }

  /** Returns the configured working directory, caching the filesystem default into the conf on first use. */
  private static Path getWorkingDirectory( Configuration conf )
    {
    String name = conf.get( "mapred.working.dir" );
    if( name != null )
      {
      return new Path( name );
      }
    else
      {
      try
        {
        Path dir = FileSystem.get( conf ).getWorkingDirectory();
        conf.set( "mapred.working.dir", dir.toString() );
        return dir;
        }
      catch( IOException e )
        {
        throw new RuntimeException( e );
        }
      }
    }

  public static Path getOutputPath( Configuration conf )
    {
    String name = conf.get( "mapred.output.dir" );
    return name == null ? null : new Path( name );
    }

  /** Serializes the given object to a compressed Base64 String; empty string when object is null. */
  public static String pack( Object object, Configuration conf )
    {
    if( object == null )
      return "";

    try
      {
      return serializeBase64( object, conf, true );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to pack object: " + object.getClass().getCanonicalName(), exception );
      }
    }

  /** Stores the ordinal to Fields map into the given property as a packed String-keyed map. */
  public static void addFields( Configuration conf, String property, Map<Integer, Fields> fields )
    {
    if( fields == null || fields.isEmpty() )
      return;

    Map<String, Fields> toPack = new HashMap<>();

    for( Map.Entry<Integer, Fields> entry : fields.entrySet() )
      toPack.put( entry.getKey().toString(), entry.getValue() );

    conf.set( property, pack( toPack, conf ) );
    }

  /** Reads back a map stored via {@link #addFields}; empty map if the property is unset. */
  public static Map<Integer, Fields> getFields( Configuration conf, String property ) throws IOException
    {
    String value = conf.getRaw( property );

    if( value == null || value.isEmpty() )
      return Collections.emptyMap();

    Map<String, Fields> map = deserializeBase64( value, conf, Map.class, true );
    Map<Integer, Fields> result = new HashMap<>();

    for( Map.Entry<String, Fields> entry : map.entrySet() )
      result.put( Integer.parseInt( entry.getKey() ), entry.getValue() );

    return result;
    }

  /**
   * Packs the first Fields instance carrying comparators into the given property, otherwise
   * records the resolved grouping Fields size under {@code property + ".size"}.
   */
  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, BaseFlowStep flowStep, Group group )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, pack( fields, conf ) );
      return;
      }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = flowStep.getPreviousScopes( group );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );
    }

  /**
   * Packs the first Fields instance carrying comparators into the given property, otherwise
   * records the given resolved Fields size under {@code property + ".size"}.
   */
  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, Fields resolvedFields )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    while( fieldsIterator.hasNext() )
      {
      Fields fields = fieldsIterator.next();

      if( fields.hasComparators() )
        {
        conf.set( property, pack( fields, conf ) );
        return;
        }
      }

    if( resolvedFields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", resolvedFields.size() );
    }
  }