/*
 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.Manifest;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.Scope;
import cascading.pipe.Group;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.util.LogUtil;
import cascading.util.Util;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.util.Util.invokeInstanceMethod;

/**
 * Class HadoopUtil is a collection of static helper methods for working with Hadoop {@link Configuration}
 * and {@link JobConf} instances, Base64 object serialization, platform/version discovery, and the
 * resolution and copying of classpath resources between the local and default file systems.
 */
public class HadoopUtil
  {
  /** Property set to {@code true} by {@link #setIsInflow(Configuration)}. */
  public static final String CASCADING_FLOW_EXECUTING = "cascading.flow.executing";

  private static final Logger LOG = LoggerFactory.getLogger( HadoopUtil.class );
  /** Charset used to move Base64 output in and out of String form. */
  private static final Charset ENCODING = StandardCharsets.US_ASCII;
  /** Serializer used when {@link ObjectSerializer#OBJECT_SERIALIZER_PROPERTY} is not set. */
  private static final Class<?> DEFAULT_OBJECT_SERIALIZER = JavaObjectSerializer.class;

  /** Lazily created by the {@code getPlatformInfo} methods; access is not synchronized. */
  private static PlatformInfo platformInfo;

  /**
   * Sets the {@link #CASCADING_FLOW_EXECUTING} property to {@code true} on the given configuration.
   *
   * @param conf the configuration to update
   */
  public static void setIsInflow( Configuration conf )
    {
    conf.setBoolean( CASCADING_FLOW_EXECUTING, true );
    }

  /**
   * Returns the value of the {@link #CASCADING_FLOW_EXECUTING} property.
   *
   * @param conf the configuration to read
   * @return the property value, {@code false} if it is not set
   */
  public static boolean isInflow( Configuration conf )
    {
    return conf.getBoolean( CASCADING_FLOW_EXECUTING, false );
    }

  /**
   * Delegates to {@link #initLog4j(Configuration)}.
   *
   * @param configuration the configuration holding the {@code log4j.logger} property
   */
  public static void initLog4j( JobConf configuration )
    {
    initLog4j( (Configuration) configuration );
    }

  /**
   * Reads the {@code log4j.logger} property, splits it on commas, splits every element on {@code =},
   * and hands each resulting array to {@link LogUtil#setLog4jLevel}.
   * <p>
   * Does nothing if the property is missing or empty, or if {@code org.apache.log4j.Logger} is not on
   * the classpath.
   *
   * @param configuration the configuration holding the {@code log4j.logger} property
   */
  public static void initLog4j( Configuration configuration )
    {
    String values = configuration.get( "log4j.logger", null );

    if( values == null || values.length() == 0 )
      return;

    if( !Util.hasClass( "org.apache.log4j.Logger" ) )
      {
      LOG.info( "org.apache.log4j.Logger is not in the current CLASSPATH, not setting log4j.logger properties" );
      return;
      }

    for( String element : values.split( "," ) )
      LogUtil.setLog4jLevel( element.split( "=" ) );
    }

  /**
   * Returns the given configuration as a {@link JobConf}. If it already is a JobConf the same instance is
   * returned, otherwise a new JobConf wrapping the configuration is created.
   *
   * @param configuration the configuration to convert
   * @return a JobConf instance
   */
  // only place JobConf should ever be returned
  public static JobConf asJobConfInstance( Configuration configuration )
    {
    if( configuration instanceof JobConf )
      return (JobConf) configuration;

    return new JobConf( configuration );
    }

  /**
   * Delegates to {@link #copyConfiguration(Object)}.
   *
   * @param parentJobConf the configuration to copy, must be a {@link Configuration}
   * @return a copy of the same runtime type as the argument
   */
  public static <C> C copyJobConf( C parentJobConf )
    {
    return copyConfiguration( parentJobConf );
    }

  /**
   * Creates a copy of the given JobConf that does not share its Credentials instance with the parent.
   *
   * @param parentJobConf the JobConf to copy
   * @return a new JobConf holding the parent's properties and a copy of its credentials
   * @throws IllegalArgumentException if parentJobConf is null
   */
  public static JobConf copyJobConf( JobConf parentJobConf )
    {
    if( parentJobConf == null )
      throw new IllegalArgumentException( "parent may not be null" );

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    final Configuration configurationCopy = new Configuration( parentJobConf );
    final JobConf jobConf = new JobConf( configurationCopy );

    jobConf.getCredentials().addAll( parentJobConf.getCredentials() );

    return jobConf;
    }

  /**
   * Creates a new JobConf populated with the given properties.
   *
   * @param properties the properties to apply, may be null
   * @return a new JobConf
   */
  public static JobConf createJobConf( Map<Object, Object> properties )
    {
    return createJobConf( properties, null );
    }

  /**
   * Creates a JobConf, either new or copied from the given default, and applies the given properties to it.
   *
   * @param properties     the properties to apply, may be null
   * @param defaultJobconf the JobConf to copy as a starting point, may be null
   * @return a new JobConf
   */
  public static JobConf createJobConf( Map<Object, Object> properties, JobConf defaultJobconf )
    {
    JobConf jobConf = defaultJobconf == null ? new JobConf() : copyJobConf( defaultJobconf );

    if( properties == null )
      return jobConf;

    return copyConfiguration( properties, jobConf );
    }

  /**
   * Creates a copy of the given {@link Configuration}, or sub-type, by reflectively calling the single
   * argument constructor of the parent's runtime class that accepts a Configuration.
   * <p>
   * If the parent exposes a {@code getCredentials} method, the parent credentials are added to the
   * credentials of the copy.
   *
   * @param parent the configuration to copy
   * @return a copy of the same runtime type as parent
   * @throws IllegalArgumentException if parent is null or is not a Configuration
   */
  public static <C> C copyConfiguration( C parent )
    {
    if( parent == null )
      throw new IllegalArgumentException( "parent may not be null" );

    if( !( parent instanceof Configuration ) )
      throw new IllegalArgumentException( "parent must be of type Configuration" );

    Configuration conf = (Configuration) parent;

    // see https://github.com/Cascading/cascading/pull/21
    // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in
    // case those Credentials are mutated later on down the road (which they will be, during job submission, in
    // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing.
    Configuration configurationCopy = new Configuration( conf );

    Configuration copiedConf = callCopyConstructor( parent.getClass(), configurationCopy );

    if( Util.hasInstanceMethod( parent, "getCredentials", null ) )
      {
      Object result = invokeInstanceMethod( parent, "getCredentials", null, null );
      Object credentials = invokeInstanceMethod( copiedConf, "getCredentials", null, null );

      invokeInstanceMethod( credentials, "addAll", new Object[]{result}, new Class[]{credentials.getClass()} );
      }

    return (C) copiedConf;
    }

  /**
   * Reflectively invokes the public constructor of {@code type} that accepts the runtime class of
   * {@code parent}.
   *
   * @param type   the class to instantiate
   * @param parent the constructor argument
   * @return the new instance
   * @throws CascadingException if the constructor cannot be found or invoked, the failure is kept as the cause
   */
  protected static <C extends Configuration> C callCopyConstructor( Class type, Configuration parent )
    {
    try
      {
      Constructor<C> constructor = type.getConstructor( parent.getClass() );

      return constructor.newInstance( parent );
      }
    catch( NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException exception )
      {
      throw new CascadingException( "unable to create copy of: " + type, exception );
      }
    }

  /**
   * Copies every non-null entry of the source map into the destination configuration using the
   * {@code toString()} value of both key and value.
   * <p>
   * If the source is a {@link Properties} instance, its {@code stringPropertyNames()} are included as keys
   * and values are looked up with {@code getProperty} when {@code get} returns null. Values that are
   * {@link Class} or {@link JobConf} instances are skipped.
   *
   * @param srcProperties    the properties to copy from
   * @param dstConfiguration the configuration to copy into
   * @return the dstConfiguration instance
   */
  public static <C extends Configuration> C copyConfiguration( Map<Object, Object> srcProperties, C dstConfiguration )
    {
    Set<Object> keys = new HashSet<Object>( srcProperties.keySet() );

    // keys will only be grabbed if both key/value are String, so keep orig keys
    if( srcProperties instanceof Properties )
      keys.addAll( ( (Properties) srcProperties ).stringPropertyNames() );

    for( Object key : keys )
      {
      Object value = srcProperties.get( key );

      if( value == null && srcProperties instanceof Properties && key instanceof String )
        value = ( (Properties) srcProperties ).getProperty( (String) key );

      if( value == null ) // don't stuff null values
        continue;

      // don't let these objects pass, even though toString is called below.
      if( value instanceof Class || value instanceof JobConf )
        continue;

      dstConfiguration.set( key.toString(), value.toString() );
      }

    return dstConfiguration;
    }

  /**
   * Copies all entries of the given configuration into a new mutable map.
   *
   * @param jobConf the configuration to read, may be null
   * @return a new map, empty if jobConf is null
   */
  public static Map<Object, Object> createProperties( Configuration jobConf )
    {
    Map<Object, Object> properties = new HashMap<Object, Object>();

    if( jobConf == null )
      return properties;

    for( Map.Entry<String, String> entry : jobConf )
      properties.put( entry.getKey(), entry.getValue() );

    return properties;
    }

  /**
   * Reads the private static {@code clientFinalizer} field of {@link FileSystem} and, if it holds a thread,
   * removes that thread from the JVM shutdown hooks.
   *
   * @return the value of the field (possibly null), or null if the field could not be read
   */
  public static Thread getHDFSShutdownHook()
    {
    try
      {
      // we must init the FS so the finalizer is registered
      FileSystem.getLocal( new JobConf() );

      Field field = FileSystem.class.getDeclaredField( "clientFinalizer" );
      field.setAccessible( true );

      Thread finalizer = (Thread) field.get( null );

      if( finalizer != null )
        Runtime.getRuntime().removeShutdownHook( finalizer );

      return finalizer;
      }
    catch( NoSuchFieldException | IllegalAccessException | IOException exception )
      {
      // best effort only, the hook simply stays registered
      LOG.debug( "unable to find and remove client hdfs shutdown hook, received exception: {}", exception.getClass().getName() );

      return null;
      }
    }

  /**
   * Base64 encodes the given bytes.
   *
   * @param bytes the bytes to encode
   * @return the Base64 encoded value as a String
   */
  public static String encodeBytes( byte[] bytes )
    {
    return new String( Base64.encodeBase64( bytes ), ENCODING );
    }

  /**
   * Decodes the given Base64 encoded String.
   *
   * @param string the Base64 encoded value
   * @return the decoded bytes
   */
  public static byte[] decodeBytes( String string )
    {
    return Base64.decodeBase64( string.getBytes( ENCODING ) );
    }

  /**
   * Instantiates the {@link ObjectSerializer} named by the {@link ObjectSerializer#OBJECT_SERIALIZER_PROPERTY}
   * property, or the default serializer if the property is missing or empty. If the serializer is
   * {@link Configurable}, the given configuration is set on it.
   *
   * @param conf the configuration to read the serializer class name from
   * @param type the type the serializer must accept
   * @return a new serializer instance
   * @throws ClassNotFoundException   if the configured serializer class cannot be loaded
   * @throws IllegalArgumentException if the serializer cannot be instantiated, or does not accept the type
   */
  public static <T> ObjectSerializer instantiateSerializer( Configuration conf, Class<T> type ) throws ClassNotFoundException
    {
    Class<ObjectSerializer> flowSerializerClass;

    String serializerClassName = conf.get( ObjectSerializer.OBJECT_SERIALIZER_PROPERTY );

    if( serializerClassName == null || serializerClassName.length() == 0 )
      flowSerializerClass = (Class<ObjectSerializer>) DEFAULT_OBJECT_SERIALIZER;
    else
      flowSerializerClass = (Class<ObjectSerializer>) Class.forName( serializerClassName );

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = flowSerializerClass.newInstance();

      if( objectSerializer instanceof Configurable )
        ( (Configurable) objectSerializer ).setConf( conf );
      }
    catch( Exception exception )
      {
      // keep the original failure as the cause instead of printing it to stderr
      throw new IllegalArgumentException( "Unable to instantiate serializer \""
        + flowSerializerClass.getName()
        + "\" for class: "
        + type.getName(), exception );
      }

    // report the resolved class, the property value is null when the default serializer is used
    if( !objectSerializer.accepts( type ) )
      throw new IllegalArgumentException( flowSerializerClass.getName() + " won't accept objects of class " + type.toString() );

    return objectSerializer;
    }

  /**
   * Serializes the given object into a Base64 encoded String, with compression enabled.
   *
   * @param object the object to serialize
   * @param conf   the configuration used to find the serializer
   * @return the Base64 encoded value
   * @throws IOException if serialization fails or the serializer class cannot be found
   */
  public static <T> String serializeBase64( T object, Configuration conf ) throws IOException
    {
    return serializeBase64( object, conf, true );
    }

  /**
   * Serializes the given object into a Base64 encoded String.
   *
   * @param object   the object to serialize
   * @param conf     the configuration used to find the serializer
   * @param compress passed through to {@link ObjectSerializer#serialize}
   * @return the Base64 encoded value
   * @throws IOException if serialization fails or the serializer class cannot be found
   */
  public static <T> String serializeBase64( T object, Configuration conf, boolean compress ) throws IOException
    {
    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, object.getClass() );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return encodeBytes( objectSerializer.serialize( object, compress ) );
    }

  /**
   * This method deserializes the Base64 encoded String into an Object instance, with decompression enabled.
   *
   * @param string the Base64 encoded value
   * @param conf   the configuration used to find the serializer
   * @param type   the expected type of the result
   * @return an Object, or null if string is null or empty
   * @throws IOException if deserialization fails or the serializer class cannot be found
   */
  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type ) throws IOException
    {
    return deserializeBase64( string, conf, type, true );
    }

  /**
   * This method deserializes the Base64 encoded String into an Object instance.
   *
   * @param string     the Base64 encoded value
   * @param conf       the configuration used to find the serializer
   * @param type       the expected type of the result
   * @param decompress passed through to {@link ObjectSerializer#deserialize}
   * @return an Object, or null if string is null or empty
   * @throws IOException if deserialization fails or the serializer class cannot be found
   */
  public static <T> T deserializeBase64( String string, Configuration conf, Class<T> type, boolean decompress ) throws IOException
    {
    if( string == null || string.length() == 0 )
      return null;

    ObjectSerializer objectSerializer;

    try
      {
      objectSerializer = instantiateSerializer( conf, type );
      }
    catch( ClassNotFoundException exception )
      {
      throw new IOException( exception );
      }

    return objectSerializer.deserialize( decodeBytes( string ), type, decompress );
    }

  /**
   * Delegates to {@link Util#findMainClass} passing {@code org.apache.hadoop} as the second argument.
   *
   * @param defaultType the default type handed to Util
   * @return the class returned by Util
   */
  public static Class findMainClass( Class defaultType )
    {
    return Util.findMainClass( defaultType, "org.apache.hadoop" );
    }

  /**
   * Returns the entries of updatedConf whose value differs from the non-null value of the same key in
   * defaultConf, plus any entries not present in defaultConf.
   * <p>
   * The working directory properties {@code mapred.working.dir} and {@code mapreduce.job.working.dir} are
   * always removed from the result.
   *
   * @param defaultConf the baseline configuration
   * @param updatedConf the configuration holding the current values
   * @return a new map of the differing entries
   */
  public static Map<String, String> getConfig( Configuration defaultConf, Configuration updatedConf )
    {
    Map<String, String> configs = new HashMap<String, String>();

    for( Map.Entry<String, String> entry : updatedConf )
      configs.put( entry.getKey(), entry.getValue() );

    for( Map.Entry<String, String> entry : defaultConf )
      {
      String defaultValue = entry.getValue();

      if( defaultValue == null )
        continue;

      // if the values are the same, lets purge from map to save space
      if( defaultValue.equals( configs.get( entry.getKey() ) ) )
        configs.remove( entry.getKey() );
      }

    // loop invariant, and must also happen when defaultConf has no entries
    configs.remove( "mapred.working.dir" );
    configs.remove( "mapreduce.job.working.dir" ); // hadoop2

    return configs;
    }

  /**
   * Creates one JobConf per given config map by merging each map into a copy of the given job configuration.
   *
   * @param job     the base configuration, it is not modified
   * @param configs the property maps to merge, one per resulting JobConf
   * @return an array with one JobConf per element of configs
   */
  public static JobConf[] getJobConfs( Configuration job, List<Map<String, String>> configs )
    {
    JobConf[] jobConfs = new JobConf[ configs.size() ];

    for( int i = 0; i < jobConfs.length; i++ )
      jobConfs[ i ] = (JobConf) mergeConf( job, configs.get( i ), false );

    return jobConfs;
    }

  /**
   * Sets every entry of the given map on the job configuration, or on a copy of it.
   *
   * @param job      the configuration to merge into
   * @param config   the properties to set
   * @param directly if true the job instance itself is modified, otherwise a copy is created and modified
   * @return the modified configuration
   */
  public static <J extends Configuration> J mergeConf( J job, Map<String, String> config, boolean directly )
    {
    Configuration currentConf = directly ? job : ( job instanceof JobConf ? copyJobConf( (JobConf) job ) : new Configuration( job ) );

    for( Map.Entry<String, String> entry : config.entrySet() )
      {
      if( LOG.isDebugEnabled() )
        LOG.debug( "merging key: {} value: {}", entry.getKey(), entry.getValue() );

      currentConf.set( entry.getKey(), entry.getValue() );
      }

    return (J) currentConf;
    }

  /**
   * Returns a new JobConf holding the entries of the given configuration, less the given keys. The
   * entries are applied on top of a freshly constructed JobConf.
   *
   * @param jobConf the configuration to read, it is not modified
   * @param keys    the property names to leave out
   * @return a new JobConf
   */
  public static Configuration removePropertiesFrom( Configuration jobConf, String... keys )
    {
    Map<Object, Object> properties = createProperties( jobConf );

    for( String key : keys )
      properties.remove( key );

    return copyConfiguration( properties, new JobConf() );
    }

  /**
   * Deletes the resource at the given path by way of an {@link Hfs} tap.
   *
   * @param conf the configuration handed to the tap
   * @param path the path to delete
   * @return the value returned by {@code Hfs.deleteResource}
   * @throws IOException if thrown by the tap
   */
  public static boolean removeStateFromDistCache( Configuration conf, String path ) throws IOException
    {
    return new Hfs( new TextLine(), path ).deleteResource( conf );
    }

  /**
   * Returns the cached PlatformInfo, creating it from the jar holding {@link JobConf} on first use.
   *
   * @return the shared PlatformInfo instance
   */
  public static PlatformInfo getPlatformInfo()
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( JobConf.class, "org/apache/hadoop", "Hadoop" );

    return platformInfo;
    }

  /**
   * Returns the cached PlatformInfo, creating it from the given arguments on first use. The arguments are
   * ignored if a PlatformInfo has already been cached.
   *
   * @param type          the class whose jar manifest is read
   * @param attributePath the manifest section to read attributes from
   * @param platformName  the platform name to report
   * @return the shared PlatformInfo instance
   */
  public static PlatformInfo getPlatformInfo( Class type, String attributePath, String platformName )
    {
    if( platformInfo == null )
      platformInfo = getPlatformInfoInternal( type, attributePath, platformName );

    return platformInfo;
    }

  /**
   * Creates a new PlatformInfo from the given arguments, the result is not cached.
   *
   * @param type          the class whose jar manifest is read
   * @param attributePath the manifest section to read attributes from
   * @param platformName  the platform name to report
   * @return a new PlatformInfo instance
   */
  public static PlatformInfo createPlatformInfo( Class type, String attributePath, String platformName )
    {
    return getPlatformInfoInternal( type, attributePath, platformName );
    }

  /**
   * Builds a PlatformInfo by reading {@code Implementation-Vendor} and {@code Implementation-Version} from
   * the manifest of the jar the given class was loaded from.
   * <p>
   * If the class was not loaded from a jar, only the platform name is populated. If the manifest cannot be
   * read, or has no version, the version returned by {@link Util#findVersion} for the jar path is used.
   */
  private static PlatformInfo getPlatformInfoInternal( Class type, String attributePath, String platformName )
    {
    URL url = type.getResource( type.getSimpleName() + ".class" );

    if( url == null || !url.toString().startsWith( "jar" ) )
      return new PlatformInfo( platformName, null, null );

    String path = url.toString();
    path = path.substring( 0, path.lastIndexOf( "!" ) + 1 ); // keep everything up to and including the '!'

    String manifestPath = path + "/META-INF/MANIFEST.MF";
    String parsedVersion = Util.findVersion( path.substring( 0, path.length() - 1 ) );

    Manifest manifest;

    // try-with-resources so the jar stream is always released
    try( InputStream inputStream = new URL( manifestPath ).openStream() )
      {
      manifest = new Manifest( inputStream );
      }
    catch( IOException exception )
      {
      LOG.warn( "unable to get manifest from {}: {}", manifestPath, exception.getMessage() );

      return new PlatformInfo( platformName, null, parsedVersion );
      }

    Attributes attributes = manifest.getAttributes( attributePath );

    if( attributes == null )
      attributes = manifest.getMainAttributes();

    if( attributes == null )
      {
      LOG.debug( "unable to get platform manifest attributes" );
      return new PlatformInfo( platformName, null, parsedVersion );
      }

    String vendor = attributes.getValue( "Implementation-Vendor" );
    String version = attributes.getValue( "Implementation-Version" );

    if( Util.isEmpty( version ) )
      version = parsedVersion;

    return new PlatformInfo( platformName, vendor, version );
    }

  /**
   * Copies paths from one local path to a remote path, for every pair that is missing or stale on the
   * remote side. If syncTimes is true, the remote modification time is set to match the local 'from'
   * path; the access time is left untouched.
   * <p>
   * Returns a map of file-name to modification time for every copied file whose remote modification time
   * was not synced to the local one: either because syncTimes is false, or because setting the time failed.
   *
   * @param config      the configuration used to get the local and default file systems
   * @param commonPaths map of local path to remote path, may be null
   * @param syncTimes   true if the remote modification time should be set to the local modification time
   * @return a map of file-name to modification time, never null
   * @throws FlowException if a file cannot be copied
   */
  public static Map<String, Long> syncPaths( Configuration config, Map<Path, Path> commonPaths, boolean syncTimes )
    {
    if( commonPaths == null )
      return Collections.emptyMap();

    Map<String, Long> timestampMap = new HashMap<>();

    Map<Path, Path> copyPaths = getCopyPaths( config, commonPaths ); // tests remote file existence or if stale

    LocalFileSystem localFS = getLocalFS( config );
    FileSystem remoteFS = getDefaultFS( config );

    for( Map.Entry<Path, Path> entry : copyPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        LOG.info( "copying from: {}, to: {}", localPath, remotePath );
        remoteFS.copyFromLocalFile( localPath, remotePath );

        if( !syncTimes )
          {
          timestampMap.put( remotePath.getName(), remoteFS.getFileStatus( remotePath ).getModificationTime() );
          continue;
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to copy local: " + localPath + " to remote: " + remotePath, exception );
        }

      FileStatus localFileStatus = null;

      try
        {
        // sync the modified times so we can lazily upload jars to hdfs after job is started
        // otherwise modified time will be local to hdfs
        localFileStatus = localFS.getFileStatus( localPath );
        remoteFS.setTimes( remotePath, localFileStatus.getModificationTime(), -1 ); // don't set the access time
        }
      catch( IOException exception )
        {
        LOG.info( "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.", remotePath );

        // NOTE(review): this records the local time even though the remote time was not changed - confirm intended
        if( localFileStatus != null )
          timestampMap.put( remotePath.getName(), localFileStatus.getModificationTime() );
        }
      }

    return timestampMap;
    }

  /**
   * Pairs up local and remote paths that are registered under the same name.
   *
   * @param localPaths  map of name to local path
   * @param remotePaths map of name to remote path
   * @return a new map of local path to remote path for every name found in both maps
   */
  public static Map<Path, Path> getCommonPaths( Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    Map<Path, Path> commonPaths = new HashMap<Path, Path>();

    for( Map.Entry<String, Path> entry : localPaths.entrySet() )
      {
      if( remotePaths.containsKey( entry.getKey() ) )
        commonPaths.put( entry.getValue(), remotePaths.get( entry.getKey() ) );
      }

    return commonPaths;
    }

  /**
   * Returns the subset of the given local to remote path pairs that need copying: the local path exists
   * and the remote path is either missing or has an older modification time than the local path.
   */
  private static Map<Path, Path> getCopyPaths( Configuration config, Map<Path, Path> commonPaths )
    {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();

    FileSystem remoteFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    for( Map.Entry<Path, Path> entry : commonPaths.entrySet() )
      {
      Path localPath = entry.getKey();
      Path remotePath = entry.getValue();

      try
        {
        boolean localExists = localFS.exists( localPath );
        boolean remoteExist = remoteFS.exists( remotePath );

        if( localExists && !remoteExist )
          {
          copyPaths.put( localPath, remotePath );
          }
        else if( localExists )
          {
          long localModTime = localFS.getFileStatus( localPath ).getModificationTime();
          long remoteModTime = remoteFS.getFileStatus( remotePath ).getModificationTime();

          if( localModTime > remoteModTime )
            copyPaths.put( localPath, remotePath );
          }
        }
      catch( IOException exception )
        {
        throw new FlowException( "unable to get handle to underlying filesystem", exception );
        }
      }

    return copyPaths;
    }

  /**
   * Sorts the given classpath elements into the localPaths and remotePaths maps, keyed by file name
   * (prefixed with resourceSubPath if given).
   * <p>
   * An element without a scheme, when the default file system is not the local one, is registered in both
   * maps: locally as-is and remotely under the remote root. An element on the local file system is
   * registered in localPaths only, anything else in remotePaths only.
   *
   * @param config          the configuration used to get the file systems
   * @param classpath       the paths to resolve
   * @param remoteRoot      the remote root directory, {@code ./.staging} is used if null
   * @param resourceSubPath optional sub-directory appended to the remote root and prefixed to names, may be null
   * @param localPaths      receives name to qualified local path
   * @param remotePaths     receives name to qualified remote path
   * @throws FlowException if a path does not exist
   */
  public static void resolvePaths( Configuration config, Collection<String> classpath, String remoteRoot, String resourceSubPath, Map<String, Path> localPaths, Map<String, Path> remotePaths )
    {
    FileSystem defaultFS = getDefaultFS( config );
    FileSystem localFS = getLocalFS( config );

    Path remoteRootPath = new Path( remoteRoot == null ? "./.staging" : remoteRoot );

    if( resourceSubPath != null )
      remoteRootPath = new Path( remoteRootPath, resourceSubPath );

    remoteRootPath = defaultFS.makeQualified( remoteRootPath );

    boolean defaultIsLocal = defaultFS.equals( localFS );

    for( String stringPath : classpath )
      {
      Path path = new Path( stringPath );

      URI uri = path.toUri();

      if( uri.getScheme() == null && !defaultIsLocal ) // we want to sync
        {
        Path localPath = localFS.makeQualified( path );

        if( !exists( localFS, localPath ) )
          throw new FlowException( "path not found: " + localPath );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        remotePaths.put( name, defaultFS.makeQualified( new Path( remoteRootPath, path.getName() ) ) );
        }
      else if( localFS.equals( getFileSystem( config, path ) ) )
        {
        if( !exists( localFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path localPath = localFS.makeQualified( path );

        String name = localPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        localPaths.put( name, localPath );
        }
      else
        {
        if( !exists( defaultFS, path ) )
          throw new FlowException( "path not found: " + path );

        Path defaultPath = defaultFS.makeQualified( path );

        String name = defaultPath.getName();

        if( resourceSubPath != null )
          name = resourceSubPath + "/" + name;

        remotePaths.put( name, defaultPath );
        }
      }
    }

  /** Tests if the path exists, rethrowing any IOException as a FlowException with the cause attached. */
  private static boolean exists( FileSystem fileSystem, Path path )
    {
    try
      {
      return fileSystem.exists( path );
      }
    catch( IOException exception )
      {
      throw new FlowException( "could not test file exists: " + path, exception );
      }
    }

  /** Returns the file system owning the given path, rethrowing any IOException as a FlowException. */
  private static FileSystem getFileSystem( Configuration config, Path path )
    {
    try
      {
      return path.getFileSystem( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  /**
   * Returns the local file system for the given configuration.
   *
   * @param config the configuration to use
   * @return the local file system
   * @throws FlowException if the file system cannot be retrieved
   */
  public static LocalFileSystem getLocalFS( Configuration config )
    {
    try
      {
      return FileSystem.getLocal( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  /**
   * Returns the default file system for the given configuration.
   *
   * @param config the configuration to use
   * @return the default file system
   * @throws FlowException if the file system cannot be retrieved
   */
  public static FileSystem getDefaultFS( Configuration config )
    {
    try
      {
      return FileSystem.get( config );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to get handle to underlying filesystem", exception );
      }
    }

  /**
   * Tests if the given configuration denotes local mode. The properties {@code mapreduce.framework.name},
   * {@code tez.local.mode} and {@code mapred.job.tracker} are tested in that order, the first one that is
   * set decides the result.
   *
   * @param conf the configuration to test
   * @return true if the deciding property denotes local mode, false otherwise or if none is set
   */
  public static boolean isLocal( Configuration conf )
    {
    // hadoop 1.0 and 2.0 use different properties to define local mode: we check the new YARN
    // property first
    String frameworkName = conf.get( "mapreduce.framework.name" );

    // we are running on hadoop 2.0 (YARN)
    if( frameworkName != null )
      return frameworkName.equals( "local" );

    // for Tez
    String tezLocal = conf.get( "tez.local.mode" );

    if( tezLocal != null )
      return tezLocal.equals( "true" );

    // hadoop 1.0: use the old property to determine the local mode
    String hadoop1 = conf.get( "mapred.job.tracker" );

    if( hadoop1 == null )
      {
      LOG.warn( "could not successfully test if Hadoop based platform is in standalone/local mode, no valid properties set, returning false - tests for: mapreduce.framework.name, tez.local.mode, and mapred.job.tracker" );
      return false;
      }

    return hadoop1.equals( "local" );
    }

  /**
   * Tests if the {@code mapreduce.framework.name} property is set.
   *
   * @param conf the configuration to test
   * @return true if the property has a value
   */
  public static boolean isYARN( Configuration conf )
    {
    return conf.get( "mapreduce.framework.name" ) != null;
    }

  /**
   * Sets the local mode properties tested by {@link #isLocal(Configuration)}, plus
   * {@code tez.runtime.optimize.local.fetch}.
   *
   * @param conf the configuration to update
   */
  public static void setLocal( Configuration conf )
    {
    // set both properties to local
    conf.set( "mapred.job.tracker", "local" );

    // yarn
    conf.set( "mapreduce.framework.name", "local" );

    // tez
    conf.set( "tez.local.mode", "true" );
    conf.set( "tez.runtime.optimize.local.fetch", "true" );
    }

  /**
   * Tests if the named class can be loaded and is assignable to the given interface. Returns false if
   * either argument is null or the class cannot be found.
   */
  private static boolean interfaceAssignableFromClassName( Class<?> xface, String className )
    {
    if( ( className == null ) || ( xface == null ) )
      return false;

    try
      {
      return xface.isAssignableFrom( Class.forName( className ) );
      }
    catch( ClassNotFoundException cnfe )
      {
      return false; // let downstream figure it out
      }
    }

  /**
   * Sets the {@code mapred.mapper.new-api} property based on the given class name: false if the class
   * lives in {@code org.apache.hadoop.mapred.} or implements the mapred InputFormat, true if it lives in
   * {@code org.apache.hadoop.mapreduce.} or extends the mapreduce InputFormat. The mapred test wins if
   * both match.
   *
   * @param conf      the configuration to update
   * @param className the class name to test, may be null
   * @return false if className is null, true if the property was set
   * @throws IllegalStateException if neither api can be determined
   */
  public static boolean setNewApi( Configuration conf, String className )
    {
    if( className == null ) // silently return and let the error be caught downstream
      return false;

    boolean isStable = className.startsWith( "org.apache.hadoop.mapred." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapred.InputFormat.class, className );

    boolean isNew = className.startsWith( "org.apache.hadoop.mapreduce." )
      || interfaceAssignableFromClassName( org.apache.hadoop.mapreduce.InputFormat.class, className );

    if( isStable )
      conf.setBoolean( "mapred.mapper.new-api", false );
    else if( isNew )
      conf.setBoolean( "mapred.mapper.new-api", true );
    else
      throw new IllegalStateException( "cannot determine if class denotes stable or new api, please set 'mapred.mapper.new-api' to the appropriate value" );

    return true;
    }

  /**
   * Appends the given paths to the comma separated {@code mapred.input.dir} property. Paths that are not
   * absolute are resolved against the working directory, all paths are escaped with
   * {@link StringUtils#escapeString}.
   *
   * @param conf  the configuration to update
   * @param paths the paths to add
   */
  public static void addInputPaths( Configuration conf, Iterable<Path> paths )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    String dirs = conf.get( "mapred.input.dir" );
    StringBuilder buffer = new StringBuilder( dirs == null ? "" : dirs );

    for( Path path : paths )
      {
      if( !path.isAbsolute() )
        path = new Path( workingDirectory, path );

      String dirStr = StringUtils.escapeString( path.toString() );

      if( buffer.length() != 0 )
        buffer.append( ',' );

      buffer.append( dirStr );
      }

    conf.set( "mapred.input.dir", buffer.toString() );
    }

  /**
   * Appends the given path, resolved against the working directory and escaped, to the comma separated
   * {@code mapred.input.dir} property.
   *
   * @param conf the configuration to update
   * @param path the path to add
   */
  public static void addInputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    String dirStr = StringUtils.escapeString( path.toString() );
    String dirs = conf.get( "mapred.input.dir" );
    conf.set( "mapred.input.dir", dirs == null ? dirStr :
      dirs + StringUtils.COMMA_STR + dirStr );
    }

  /**
   * Sets the {@code mapred.output.dir} property to the given path, resolved against the working directory.
   *
   * @param conf the configuration to update
   * @param path the output path
   */
  public static void setOutputPath( Configuration conf, Path path )
    {
    Path workingDirectory = getWorkingDirectory( conf );
    path = new Path( workingDirectory, path );
    conf.set( "mapred.output.dir", path.toString() );
    }

  /**
   * Returns the {@code mapred.working.dir} property as a Path. If the property is not set, the working
   * directory of the default file system is stored in the property and returned.
   */
  private static Path getWorkingDirectory( Configuration conf )
    {
    String name = conf.get( "mapred.working.dir" );
    if( name != null )
      {
      return new Path( name );
      }
    else
      {
      try
        {
        Path dir = FileSystem.get( conf ).getWorkingDirectory();
        conf.set( "mapred.working.dir", dir.toString() );
        return dir;
        }
      catch( IOException e )
        {
        throw new RuntimeException( e );
        }
      }
    }

  /**
   * Returns the {@code mapred.output.dir} property as a Path.
   *
   * @param conf the configuration to read
   * @return the output path, or null if the property is not set
   */
  public static Path getOutputPath( Configuration conf )
    {
    String name = conf.get( "mapred.output.dir" );
    return name == null ? null : new Path( name );
    }

  /**
   * Serializes the given object into a compressed Base64 encoded String.
   *
   * @param object the object to serialize, may be null
   * @param conf   the configuration used to find the serializer
   * @return the encoded value, or an empty String if object is null
   * @throws FlowException if serialization fails
   */
  public static String pack( Object object, Configuration conf )
    {
    if( object == null )
      return "";

    try
      {
      return serializeBase64( object, conf, true );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to pack object: " + object.getClass().getCanonicalName(), exception );
      }
    }

  /**
   * Packs the given map into the named property, with the Integer keys converted to Strings. Does nothing
   * if the map is null or empty.
   *
   * @param conf     the configuration to update
   * @param property the property name to store the packed map under
   * @param fields   the map to store, may be null
   */
  public static void addFields( Configuration conf, String property, Map<Integer, Fields> fields )
    {
    if( fields == null || fields.isEmpty() )
      return;

    Map<String, Fields> toPack = new HashMap<>();

    for( Map.Entry<Integer, Fields> entry : fields.entrySet() )
      toPack.put( entry.getKey().toString(), entry.getValue() );

    conf.set( property, pack( toPack, conf ) );
    }

  /**
   * Reads back a map stored by {@link #addFields(Configuration, String, Map)}.
   *
   * @param conf     the configuration to read
   * @param property the property name the packed map is stored under
   * @return the stored map, or an empty map if the property is not set or empty
   * @throws IOException if the value cannot be deserialized
   */
  public static Map<Integer, Fields> getFields( Configuration conf, String property ) throws IOException
    {
    String value = conf.getRaw( property );

    if( value == null || value.isEmpty() )
      return Collections.emptyMap();

    Map<String, Fields> map = deserializeBase64( value, conf, Map.class, true );
    Map<Integer, Fields> result = new HashMap<>();

    for( Map.Entry<String, Fields> entry : map.entrySet() )
      result.put( Integer.parseInt( entry.getKey() ), entry.getValue() );

    return result;
    }

  /**
   * If the first Fields value of the given map has comparators, packs it into the named property.
   * Otherwise, the size of the out values fields of the first previous scope of the group, if not zero,
   * is stored under the property name suffixed with {@code .size}. Does nothing if the map is empty.
   *
   * @param conf     the configuration to update
   * @param property the property name
   * @param map      the fields to test for comparators
   * @param flowStep the step used to look up the previous scopes of the group
   * @param group    the group to look up
   */
  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, BaseFlowStep flowStep, Group group )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    // NOTE(review): only the first value is tested here, the other overload tests every value - confirm intended
    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, pack( fields, conf ) );
      return;
      }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = flowStep.getPreviousScopes( group );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );
    }

  /**
   * Packs the first Fields value of the given map that has comparators into the named property. If none
   * has comparators, the size of the resolved fields, if not zero, is stored under the property name
   * suffixed with {@code .size}. Does nothing if the map is empty.
   *
   * @param conf           the configuration to update
   * @param property       the property name
   * @param map            the fields to test for comparators
   * @param resolvedFields the fields whose size is stored if no comparators are found
   */
  public static void addComparators( Configuration conf, String property, Map<String, Fields> map, Fields resolvedFields )
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    while( fieldsIterator.hasNext() )
      {
      Fields fields = fieldsIterator.next();

      if( fields.hasComparators() )
        {
        conf.set( property, pack( fields, conf ) );
        return;
        }
      }

    if( resolvedFields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", resolvedFields.size() );
    }
  }