001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.tuple.hadoop; 022 023import java.io.DataInputStream; 024import java.io.DataOutputStream; 025import java.io.IOException; 026import java.util.ArrayList; 027import java.util.Collection; 028import java.util.Collections; 029import java.util.Comparator; 030import java.util.HashMap; 031import java.util.LinkedList; 032import java.util.List; 033import java.util.Map; 034 035import cascading.CascadingException; 036import cascading.flow.FlowProcess; 037import cascading.flow.FlowProps; 038import cascading.flow.hadoop.util.HadoopUtil; 039import cascading.tuple.Comparison; 040import cascading.tuple.Fields; 041import cascading.tuple.Tuple; 042import cascading.tuple.TupleException; 043import cascading.tuple.hadoop.io.HadoopTupleOutputStream; 044import cascading.tuple.hadoop.io.IndexTupleDeserializer; 045import cascading.tuple.hadoop.io.IndexTupleSerializer; 046import cascading.tuple.hadoop.io.KeyIndexTupleDeserializer; 047import cascading.tuple.hadoop.io.KeyIndexTupleSerializer; 048import cascading.tuple.hadoop.io.KeyTupleDeserializer; 049import cascading.tuple.hadoop.io.KeyTupleSerializer; 050import cascading.tuple.hadoop.io.TupleDeserializer; 051import 
cascading.tuple.hadoop.io.TuplePairDeserializer; 052import cascading.tuple.hadoop.io.TuplePairSerializer; 053import cascading.tuple.hadoop.io.TupleSerializer; 054import cascading.tuple.hadoop.io.ValueIndexTupleDeserializer; 055import cascading.tuple.hadoop.io.ValueIndexTupleSerializer; 056import cascading.tuple.hadoop.io.ValueTupleDeserializer; 057import cascading.tuple.hadoop.io.ValueTupleSerializer; 058import cascading.tuple.io.IndexTuple; 059import cascading.tuple.io.KeyIndexTuple; 060import cascading.tuple.io.KeyTuple; 061import cascading.tuple.io.TupleInputStream; 062import cascading.tuple.io.TupleOutputStream; 063import cascading.tuple.io.TuplePair; 064import cascading.tuple.io.ValueIndexTuple; 065import cascading.tuple.io.ValueTuple; 066import cascading.util.Util; 067import org.apache.hadoop.conf.Configuration; 068import org.apache.hadoop.conf.Configured; 069import org.apache.hadoop.io.WritableUtils; 070import org.apache.hadoop.io.serializer.Deserializer; 071import org.apache.hadoop.io.serializer.Serialization; 072import org.apache.hadoop.io.serializer.SerializationFactory; 073import org.apache.hadoop.io.serializer.Serializer; 074import org.apache.hadoop.io.serializer.WritableSerialization; 075import org.apache.hadoop.util.ReflectionUtils; 076import org.slf4j.Logger; 077import org.slf4j.LoggerFactory; 078 079import static cascading.tuple.hadoop.TupleSerializationProps.HADOOP_IO_SERIALIZATIONS; 080 081/** 082 * Class TupleSerialization is an implementation of Hadoop's {@link Serialization} interface. 083 * <p> 084 * Typically developers will not use this implementation directly as it is automatically added 085 * to any relevant MapReduce jobs via the {@link org.apache.hadoop.conf.Configuration}. 086 * <p> 087 * By default, all primitive types are natively handled, and {@link org.apache.hadoop.io.BytesWritable} 088 * has a pre-configured serialization token since byte arrays are not handled natively by {@link Tuple}. 
 * <p>
 * To add or manipulate Hadoop serializations or Cascading serializations tokens, see
 * {@link TupleSerializationProps} for a fluent property builder class.
 * <p>
 * By default this Serialization interface registers the class {@link org.apache.hadoop.io.BytesWritable} as
 * token 127.
 */
@SerializationToken(
  tokens = {127},
  classNames = {"org.apache.hadoop.io.BytesWritable"})
public class TupleSerialization extends Configured implements Serialization
  {

  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( TupleSerialization.class );

  /** Field defaultComparator - fallback comparator resolved from {@link FlowProps#DEFAULT_ELEMENT_COMPARATOR} */
  private Comparator defaultComparator;
  /** Field classCache - caches classes resolved by name in {@link #getClass(String)} */
  private final Map<String, Class> classCache = new HashMap<String, Class>();
  /** Field serializationFactory - lazily created against the current configuration */
  private SerializationFactory serializationFactory;

  /** Field tokenClassesMap - serialization token to classname, populated by initTokenMaps() */
  private HashMap<Integer, String> tokenClassesMap;
  /** Field classesTokensMap - classname to serialization token, inverse of tokenClassesMap */
  private HashMap<String, Integer> classesTokensMap;
  /** Field tokenMapSize */
  private long tokensSize = 0;

  // node ordinals parsed from "cascading.node.ordinals", lazily loaded
  List<Integer> ordinals;

  // per-ordinal field maps parsed from the configuration, lazily loaded
  Map<Integer, Fields> keyFieldsMap;
  Map<Integer, Fields> sortFieldsMap;
  Map<Integer, Fields> valueFieldsMap;

  // fields resolved for the first ordinal, lazily loaded
  Fields keyFields;
  Fields sortFields;
  Fields valueFields;

  Boolean typesRequired; // for testing purposes
  Boolean typesIgnored; // for testing purposes

  /** Returns the raw token declaration string from the configuration, may be null. */
  static String getSerializationTokens( Configuration jobConf )
    {
    return jobConf.get( TupleSerializationProps.SERIALIZATION_TOKENS );
    }

  /**
   * Adds this class as a Hadoop Serialization class. This method is safe to call redundantly.
   * <p>
   * This method will guarantee this class and {@link WritableSerialization} are
   * first in the list, as both are required.
   *
   * @param jobConf of type JobConf
   */
  public static void setSerializations( Configuration jobConf )
    {
    setSerializations( jobConf, Collections.emptySet() );
    }

  public static void setSerializations( Configuration jobConf, Collection<String> provided )
    {
    String serializations = getSerializations( jobConf );

    LinkedList<String> list = new LinkedList<String>();

    list.addAll( provided );

    if( serializations != null && !serializations.isEmpty() )
      Collections.addAll( list, serializations.split( "," ) );

    // required by MultiInputSplit
    String writable = WritableSerialization.class.getName();
    String tuple = TupleSerialization.class.getName();

    // remove any existing occurrences so the addFirst calls below control ordering
    list.remove( writable );
    list.remove( tuple );

    list.addFirst( writable );
    list.addFirst( tuple );

    // result ordering: tuple serialization first, writable serialization second
    jobConf.set( HADOOP_IO_SERIALIZATIONS, Util.join( list, "," ) );
    }

  static String getSerializations( Configuration jobConf )
    {
    return jobConf.get( HADOOP_IO_SERIALIZATIONS, null );
    }

  /**
   * Returns the given comparator if it already is an instance of the configured default comparator
   * type (and is not itself Configured), otherwise creates a fresh configured instance.
   */
  public static Comparator getDefaultComparator( Comparator comparator, Configuration jobConf )
    {
    String typeName = jobConf.get( FlowProps.DEFAULT_ELEMENT_COMPARATOR );

    if( Util.isEmpty( typeName ) )
      return null;

    if( comparator == null )
      return createComparator( jobConf, typeName );

    // a Configured comparator must be re-created so it receives the current configuration
    if( comparator.getClass().getName().equals( typeName ) && !( comparator instanceof Configured ) )
      return comparator;

    return createComparator( jobConf, typeName );
    }

  /** Creates the configured default element comparator, or null when none is configured. */
  public static Comparator getDefaultComparator( Configuration jobConf )
    {
    String typeName = jobConf.get( FlowProps.DEFAULT_ELEMENT_COMPARATOR );

    if( Util.isEmpty( typeName ) )
      return null;

    return createComparator( jobConf, typeName );
    }

  private static Comparator createComparator( Configuration jobConf, String typeName )
    {
    LOG.debug( "using default comparator: {}", typeName );

    try
      {
      Class<Comparator> type = (Class<Comparator>) TupleSerialization.class.getClassLoader().loadClass( typeName );

      // ReflectionUtils injects jobConf when the comparator is Configurable
      return ReflectionUtils.newInstance( type, jobConf );
      }
    catch( ClassNotFoundException exception )
      {
      throw new CascadingException( "unable to load class: " + typeName, exception );
      }
    }

  /** Constructor TupleSerialization creates a new TupleSerialization instance. */
  public TupleSerialization()
    {
    }

  /**
   * Constructor TupleSerialization creates a new TupleSerialization instance backed by the
   * given FlowProcess; configuration lookups delegate to the flow process properties.
   *
   * @param flowProcess of type FlowProcess
   */
  public TupleSerialization( final FlowProcess<? extends Configuration> flowProcess )
    {
    super( new Configuration()
      {
      @Override
      public String get( String name )
        {
        return get( name, null );
        }

      @Override
      public String get( String name, String defaultValue )
        {
        Object value = flowProcess.getProperty( name );
        return value == null ? defaultValue : String.valueOf( value );
        }
      } );
    }

  /**
   * Constructor TupleSerialization creates a new TupleSerialization instance.
   *
   * @param conf of type Configuration
   */
  public TupleSerialization( Configuration conf )
    {
    super( conf );
    }

  @Override
  public void setConf( Configuration conf )
    {
    super.setConf( conf );

    if( conf != null )
      defaultComparator = getDefaultComparator( conf );
    }

  @Override
  public Configuration getConf()
    {
    // lazily self-initialize so accessors below always have a configuration to read
    if( super.getConf() == null )
      setConf( new Configuration() );

    return super.getConf();
    }

  /** Returns true when type information should be dropped during serialization; cached after first read. */
  public boolean areTypesIgnored()
    {
    if( typesIgnored == null )
      {
      typesIgnored = getConf().getBoolean( TupleSerializationProps.IGNORE_TYPES, false );

      if( typesIgnored )
        LOG.info( "types are being ignored during serialization" );
      }

    return typesIgnored;
    }

  /** Returns true when declared field types must be honored during serialization; cached after first read. */
  public boolean areTypesRequired()
    {
    if( typesRequired == null )
      {
      typesRequired = getConf().getBoolean( TupleSerializationProps.REQUIRE_TYPES, false );

      if( typesRequired )
        LOG.info( "types are being enforced during serialization" );
      }

    return typesRequired;
    }

  SerializationFactory getSerializationFactory()
    {
    if( serializationFactory == null )
      serializationFactory = new SerializationFactory( getConf() );

    return serializationFactory;
    }

  /** Returns the key Fields for the first node ordinal, or null when no ordinals are configured. */
  public Fields getKeyFields()
    {
    if( keyFields == null && getFirstOrdinal() != null )
      keyFields = getKeyFieldsMap().get( getFirstOrdinal() );

    return keyFields;
    }

  public Class[] getKeyTypes()
    {
    Fields fields = getKeyFields();

    return getTypesFor( fields );
    }

  /** Returns the declared type classes for the fields, or null when types are ignored or fields is null. */
  public Class[] getTypesFor( Fields fields )
    {
    if( areTypesIgnored() || fields == null )
      return null;

    return fields.getTypesClasses();
    }

  /** Returns the sort Fields for the first node ordinal, or null when no ordinals are configured. */
  public Fields getSortFields()
    {
    if( sortFields == null && getFirstOrdinal() != null )
      sortFields = getSortFieldsMap().get( getFirstOrdinal() );

    return sortFields;
    }
public Class[] getSortTypes() 340 { 341 return getTypesFor( getSortFields() ); 342 } 343 344 public Fields getValueFields() 345 { 346 if( valueFields == null && getFirstOrdinal() != null ) 347 valueFields = getValueFieldsMap().get( getFirstOrdinal() ); 348 349 return valueFields; 350 } 351 352 public Fields getMaskedValueFields() 353 { 354 return maskVoid( getValueFields(), getKeyFields() ); 355 } 356 357 public Class[] getValueTypes() 358 { 359 return getTypesFor( getValueFields() ); 360 } 361 362 public Map<Integer, Class[]> getKeyTypeMap() 363 { 364 if( areTypesIgnored() || getKeyFieldsMap() == null ) 365 return Collections.emptyMap(); 366 367 Map<Integer, Class[]> map = new HashMap<>(); 368 369 for( Map.Entry<Integer, Fields> entry : getKeyFieldsMap().entrySet() ) 370 map.put( entry.getKey(), entry.getValue().getTypesClasses() ); 371 372 return map; 373 } 374 375 public Map<Integer, Class[]> getValueTypeMap() 376 { 377 if( areTypesIgnored() || getValueFieldsMap() == null ) 378 return Collections.emptyMap(); 379 380 Map<Integer, Class[]> map = new HashMap<>(); 381 382 for( Map.Entry<Integer, Fields> entry : getValueFieldsMap().entrySet() ) 383 map.put( entry.getKey(), entry.getValue().getTypesClasses() ); 384 385 return map; 386 } 387 388 public Map<Integer, Class[]> getMaskedValueTypeMap() 389 { 390 if( areTypesIgnored() || getValueFieldsMap() == null ) 391 return Collections.emptyMap(); 392 393 Map<Integer, Fields> keyFieldsMap = getKeyFieldsMap(); 394 395 if( keyFieldsMap == null || keyFieldsMap.isEmpty() ) 396 return getValueTypeMap(); 397 398 Map<Integer, Class[]> map = new HashMap<>(); 399 400 for( Map.Entry<Integer, Fields> entry : getValueFieldsMap().entrySet() ) 401 { 402 Integer ordinal = entry.getKey(); 403 Fields valueFields = entry.getValue(); 404 Fields keyFields = keyFieldsMap.get( ordinal ); 405 406 map.put( ordinal, maskVoid( valueFields, keyFields ).getTypesClasses() ); 407 } 408 409 return map; 410 } 411 412 public List<Integer> getOrdinals() 
413 { 414 if( ordinals == null ) 415 ordinals = Util.split( Integer.class, ",", getConf().get( "cascading.node.ordinals" ) ); 416 417 return ordinals; 418 } 419 420 public Integer getFirstOrdinal() 421 { 422 if( getOrdinals().isEmpty() ) 423 return null; 424 425 return Util.getFirst( getOrdinals() ); 426 } 427 428 public Map<Integer, Fields> getKeyFieldsMap() 429 { 430 if( keyFieldsMap == null ) 431 keyFieldsMap = getFields( getConf(), "cascading.node.key.fields" ); 432 433 return keyFieldsMap; 434 } 435 436 public Map<Integer, Fields> getSortFieldsMap() 437 { 438 if( sortFields == null ) 439 sortFieldsMap = getFields( getConf(), "cascading.node.sort.fields" ); 440 441 return sortFieldsMap; 442 } 443 444 public Map<Integer, Fields> getValueFieldsMap() 445 { 446 if( valueFieldsMap == null ) 447 valueFieldsMap = getFields( getConf(), "cascading.node.value.fields" ); 448 449 return valueFieldsMap; 450 } 451 452 /** Must be called before {@link #getClassNameFor(int)} and {@link #getTokenFor(String)} methods. 
*/ 453 void initTokenMaps() 454 { 455 if( tokenClassesMap != null ) 456 return; 457 458 tokenClassesMap = new HashMap<>(); 459 classesTokensMap = new HashMap<>(); 460 461 String tokenProperty = getSerializationTokens( getConf() ); 462 463 if( tokenProperty != null ) 464 { 465 tokenProperty = tokenProperty.replaceAll( "\\s", "" ); // allow for whitespace in token set 466 467 for( String pair : tokenProperty.split( "," ) ) 468 { 469 String[] elements = pair.split( "=" ); 470 addToken( null, Integer.parseInt( elements[ 0 ] ), elements[ 1 ] ); 471 } 472 } 473 474 String serializationsString = getSerializations( getConf() ); 475 476 LOG.debug( "using hadoop serializations from the job conf: {} ", serializationsString ); 477 478 if( serializationsString == null ) 479 return; 480 481 String[] serializations = serializationsString.split( "," ); 482 483 for( String serializationName : serializations ) 484 { 485 try 486 { 487 Class type = getConf().getClassByName( serializationName ); 488 489 SerializationToken tokenAnnotation = (SerializationToken) type.getAnnotation( SerializationToken.class ); 490 491 if( tokenAnnotation == null ) 492 continue; 493 494 if( tokenAnnotation.tokens().length != tokenAnnotation.classNames().length ) 495 throw new CascadingException( "serialization annotation tokens and classNames must be the same length" ); 496 497 int[] tokens = tokenAnnotation.tokens(); 498 499 for( int i = 0; i < tokens.length; i++ ) 500 addToken( type, tokens[ i ], tokenAnnotation.classNames()[ i ] ); 501 } 502 catch( ClassNotFoundException exception ) 503 { 504 LOG.warn( "unable to load serialization class: {}", serializationName, exception ); 505 } 506 } 507 508 tokensSize = tokenClassesMap.size(); 509 } 510 511 private void addToken( Class type, int token, String className ) 512 { 513 if( type != null && !type.getName().startsWith( "cascading." 
) && token < 128 ) 514 throw new CascadingException( "serialization annotation tokens may not be less than 128, was: " + token ); 515 516 if( tokenClassesMap.containsKey( token ) ) 517 { 518 if( type == null ) 519 throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " found in properties" ); 520 521 throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " on serialization: " + type.getName() ); 522 } 523 524 if( classesTokensMap.containsKey( className ) ) 525 { 526 if( type == null ) 527 throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " found in properties " ); 528 529 throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " on serialization: " + type.getName() ); 530 } 531 532 LOG.debug( "adding serialization token: {}, for classname: {}", token, className ); 533 534 tokenClassesMap.put( token, className ); 535 classesTokensMap.put( className, token ); 536 } 537 538 /** 539 * Returns the className for the given token. 540 * 541 * @param token of type int 542 * @return a String 543 */ 544 final String getClassNameFor( int token ) 545 { 546 initTokenMaps(); 547 548 if( tokensSize == 0 ) 549 return null; 550 551 return tokenClassesMap.get( token ); 552 } 553 554 final long getTokensMapSize() 555 { 556 return tokensSize; 557 } 558 559 /** 560 * Returns the token for the given className. 
561 * 562 * @param className of type String 563 * @return an Integer 564 */ 565 final Integer getTokenFor( String className ) 566 { 567 initTokenMaps(); 568 569 if( tokensSize == 0 ) 570 return null; 571 572 return classesTokensMap.get( className ); 573 } 574 575 public Comparator getDefaultComparator() 576 { 577 return defaultComparator; 578 } 579 580 public Comparator getComparator( Class type ) 581 { 582 Serialization serialization = getSerialization( type ); 583 584 Comparator comparator = null; 585 586 if( serialization instanceof Comparison ) 587 comparator = ( (Comparison) serialization ).getComparator( type ); 588 589 if( comparator != null ) 590 return comparator; 591 592 return defaultComparator; 593 } 594 595 Serialization getSerialization( String className ) 596 { 597 return getSerialization( getClass( className ) ); 598 } 599 600 Serialization getSerialization( Class type ) 601 { 602 return getSerializationFactory().getSerialization( type ); 603 } 604 605 Serializer getNewSerializer( Class type ) 606 { 607 try 608 { 609 Serializer serializer = getSerializationFactory().getSerializer( type ); 610 611 if( serializer == null ) 612 throw new CascadingException( "unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName() ); 613 614 return serializer; 615 } 616 catch( NullPointerException exception ) 617 { 618 throw new CascadingException( "unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName() ); 619 } 620 } 621 622 Deserializer getNewDeserializer( String className ) 623 { 624 try 625 { 626 Deserializer deserializer = getSerializationFactory().getDeserializer( getClass( className ) ); 627 628 if( deserializer == null ) 629 throw new CascadingException( "unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName() ); 630 631 return deserializer; 632 } 633 catch( NullPointerException exception ) 634 { 635 
throw new CascadingException( "unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName() ); 636 } 637 } 638 639 KeyTupleDeserializer getKeyTupleDeserializer() 640 { 641 return new KeyTupleDeserializer( getElementReader() ); 642 } 643 644 ValueTupleDeserializer getValueTupleDeserializer() 645 { 646 return new ValueTupleDeserializer( getElementReader() ); 647 } 648 649 TuplePairDeserializer getTuplePairDeserializer() 650 { 651 return new TuplePairDeserializer( getElementReader() ); 652 } 653 654 /** 655 * Method getElementReader returns the elementReader of this TupleSerialization object. 656 * 657 * @return the elementReader (type SerializationElementReader) of this TupleSerialization object. 658 */ 659 public SerializationElementReader getElementReader() 660 { 661 return new SerializationElementReader( this ); 662 } 663 664 TupleDeserializer getTupleDeserializer() 665 { 666 return new TupleDeserializer( getElementReader() ); 667 } 668 669 private KeyTupleSerializer getKeyTupleSerializer() 670 { 671 return new KeyTupleSerializer( getElementWriter() ); 672 } 673 674 private ValueTupleSerializer getValueTupleSerializer() 675 { 676 return new ValueTupleSerializer( getElementWriter() ); 677 } 678 679 private TuplePairSerializer getTuplePairSerializer() 680 { 681 return new TuplePairSerializer( getElementWriter() ); 682 } 683 684 KeyIndexTupleDeserializer getKeyIndexTupleDeserializer() 685 { 686 return new KeyIndexTupleDeserializer( getElementReader() ); 687 } 688 689 ValueIndexTupleDeserializer getValueIndexTupleDeserializer() 690 { 691 return new ValueIndexTupleDeserializer( getElementReader() ); 692 } 693 694 IndexTupleDeserializer getIndexTupleDeserializer() 695 { 696 return new IndexTupleDeserializer( getElementReader() ); 697 } 698 699 /** 700 * Method getElementWriter returns the elementWriter of this TupleSerialization object. 
701 * 702 * @return the elementWriter (type SerializationElementWriter) of this TupleSerialization object. 703 */ 704 public SerializationElementWriter getElementWriter() 705 { 706 return new SerializationElementWriter( this ); 707 } 708 709 private TupleSerializer getTupleSerializer() 710 { 711 return new TupleSerializer( getElementWriter() ); 712 } 713 714 private KeyIndexTupleSerializer getKeyIndexTupleSerializer() 715 { 716 return new KeyIndexTupleSerializer( getElementWriter() ); 717 } 718 719 private ValueIndexTupleSerializer getValueIndexTupleSerializer() 720 { 721 return new ValueIndexTupleSerializer( getElementWriter() ); 722 } 723 724 private IndexTupleSerializer getIndexTupleSerializer() 725 { 726 return new IndexTupleSerializer( getElementWriter() ); 727 } 728 729 public boolean accept( Class c ) 730 { 731 return Tuple.class == c || 732 KeyTuple.class == c || ValueTuple.class == c || 733 KeyIndexTuple.class == c || ValueIndexTuple.class == c || 734 TuplePair.class == c || IndexTuple.class == c; 735 } 736 737 public Deserializer getDeserializer( Class c ) 738 { 739 if( c == Tuple.class ) 740 return getTupleDeserializer(); 741 else if( c == KeyTuple.class ) 742 return getKeyTupleDeserializer(); 743 else if( c == ValueTuple.class ) 744 return getValueTupleDeserializer(); 745 else if( c == KeyIndexTuple.class ) 746 return getKeyIndexTupleDeserializer(); 747 else if( c == ValueIndexTuple.class ) 748 return getValueIndexTupleDeserializer(); 749 else if( c == TuplePair.class ) 750 return getTuplePairDeserializer(); 751 else if( c == IndexTuple.class ) 752 return getIndexTupleDeserializer(); 753 754 throw new IllegalArgumentException( "unknown class, cannot deserialize: " + c.getName() ); 755 } 756 757 public Serializer getSerializer( Class c ) 758 { 759 if( c == Tuple.class ) 760 return getTupleSerializer(); 761 else if( c == KeyTuple.class ) 762 return getKeyTupleSerializer(); 763 else if( c == ValueTuple.class ) 764 return getValueTupleSerializer(); 765 else 
if( c == KeyIndexTuple.class ) 766 return getKeyIndexTupleSerializer(); 767 else if( c == ValueIndexTuple.class ) 768 return getValueIndexTupleSerializer(); 769 else if( c == TuplePair.class ) 770 return getTuplePairSerializer(); 771 else if( c == IndexTuple.class ) 772 return getIndexTupleSerializer(); 773 774 throw new IllegalArgumentException( "unknown class, cannot serialize: " + c.getName() ); 775 } 776 777 public Class getClass( String className ) 778 { 779 Class type = classCache.get( className ); 780 781 if( type != null ) 782 return type; 783 784 try 785 { 786 if( className.charAt( 0 ) == '[' ) 787 type = Class.forName( className, true, Thread.currentThread().getContextClassLoader() ); 788 else 789 type = Thread.currentThread().getContextClassLoader().loadClass( className ); 790 } 791 catch( ClassNotFoundException exception ) 792 { 793 throw new TupleException( "unable to load class named: " + className, exception ); 794 } 795 796 classCache.put( className, type ); 797 798 return type; 799 } 800 801 public static Map<Integer, Fields> getFields( Configuration conf, String property ) 802 { 803 try 804 { 805 return HadoopUtil.getFields( conf, property ); 806 } 807 catch( IOException exception ) 808 { 809 LOG.warn( "unable to get fields for: " + property ); 810 811 return Collections.emptyMap(); 812 } 813 } 814 815 private static Fields maskVoid( Fields fields, Fields mask ) 816 { 817 if( fields == null ) 818 return null; 819 820 if( mask == null || !fields.hasTypes() || !mask.hasTypes() ) 821 return fields; 822 823 Fields voidedKey = mask.applyTypes( Fields.size( mask.size(), Void.class ) ); 824 825 fields = fields.applyTypes( voidedKey ); 826 827 return fields; 828 } 829 830 public static class SerializationElementReader implements TupleInputStream.ElementReader 831 { 832 /** Field LOG */ 833 private static final Logger LOG = LoggerFactory.getLogger( SerializationElementReader.class ); 834 835 /** Field tupleSerialization */ 836 private final 
TupleSerialization tupleSerialization; 837 838 /** Field deserializers */ 839 final Map<String, Deserializer> deserializers = new HashMap<String, Deserializer>(); 840 841 /** 842 * Constructor SerializationElementReader creates a new SerializationElementReader instance. 843 * 844 * @param tupleSerialization of type TupleSerialization 845 */ 846 public SerializationElementReader( TupleSerialization tupleSerialization ) 847 { 848 this.tupleSerialization = tupleSerialization; 849 } 850 851 public TupleSerialization getTupleSerialization() 852 { 853 return tupleSerialization; 854 } 855 856 public Object read( int token, DataInputStream inputStream ) throws IOException 857 { 858 String className = getClassNameFor( token, inputStream ); 859 Deserializer deserializer = getDeserializerFor( inputStream, className ); 860 861 Object foundObject = null; 862 Object object; 863 864 try 865 { 866 object = deserializer.deserialize( foundObject ); 867 } 868 catch( IOException exception ) 869 { 870 LOG.error( "failed deserializing token: " + token + " with classname: " + className, exception ); 871 872 throw exception; 873 } 874 875 return object; 876 } 877 878 public Object read( Class type, DataInputStream inputStream ) throws IOException 879 { 880 String className = type.getName(); 881 Deserializer deserializer = getDeserializerFor( inputStream, className ); 882 883 Object foundObject = null; 884 Object object; 885 886 try 887 { 888 object = deserializer.deserialize( foundObject ); 889 } 890 catch( IOException exception ) 891 { 892 LOG.error( "failed deserializing: " + className, exception ); 893 894 throw exception; 895 } 896 897 return object; 898 } 899 900 @Override 901 public Comparator getComparatorFor( int token, DataInputStream inputStream ) throws IOException 902 { 903 Class type = tupleSerialization.getClass( getClassNameFor( token, inputStream ) ); 904 905 return tupleSerialization.getComparator( type ); 906 } 907 908 private Deserializer getDeserializerFor( 
DataInputStream inputStream, String className ) throws IOException 909 { 910 Deserializer deserializer = deserializers.get( className ); 911 912 if( deserializer == null ) 913 { 914 deserializer = tupleSerialization.getNewDeserializer( className ); 915 deserializer.open( inputStream ); 916 deserializers.put( className, deserializer ); 917 } 918 919 return deserializer; 920 } 921 922 public String getClassNameFor( int token, DataInputStream inputStream ) throws IOException 923 { 924 String className = tupleSerialization.getClassNameFor( token ); 925 926 try 927 { 928 if( className == null ) 929 className = WritableUtils.readString( inputStream ); 930 } 931 catch( IOException exception ) 932 { 933 LOG.error( "unable to resolve token: {}, to a valid classname, with token map of size: {}, rethrowing IOException", token, tupleSerialization.getTokensMapSize() ); 934 throw exception; 935 } 936 937 return className; 938 } 939 940 public void close() 941 { 942 if( deserializers.size() == 0 ) 943 return; 944 945 Collection<Deserializer> clone = new ArrayList<Deserializer>( deserializers.values() ); 946 947 deserializers.clear(); 948 949 for( Deserializer deserializer : clone ) 950 { 951 try 952 { 953 deserializer.close(); 954 } 955 catch( IOException exception ) 956 { 957 // do nothing 958 } 959 } 960 } 961 } 962 963 public static class SerializationElementWriter implements TupleOutputStream.ElementWriter 964 { 965 /** Field LOG */ 966 private static final Logger LOG = LoggerFactory.getLogger( SerializationElementWriter.class ); 967 968 /** Field tupleSerialization */ 969 private final TupleSerialization tupleSerialization; 970 971 /** Field serializers */ 972 final Map<Class, Serializer> serializers = new HashMap<Class, Serializer>(); 973 974 public SerializationElementWriter( TupleSerialization tupleSerialization ) 975 { 976 this.tupleSerialization = tupleSerialization; 977 } 978 979 public TupleSerialization getTupleSerialization() 980 { 981 return tupleSerialization; 982 
} 983 984 public void write( DataOutputStream outputStream, Object object ) throws IOException 985 { 986 Class<?> type = object.getClass(); 987 String className = type.getName(); 988 Integer token = tupleSerialization.getTokenFor( className ); 989 990 if( token == null ) 991 { 992 LOG.debug( "no serialization token found for classname: {}", className ); 993 994 WritableUtils.writeVInt( outputStream, HadoopTupleOutputStream.WRITABLE_TOKEN ); // denotes to punt to hadoop serialization 995 WritableUtils.writeString( outputStream, className ); 996 } 997 else 998 { 999 WritableUtils.writeVInt( outputStream, token ); 1000 } 1001 1002 Serializer serializer = getSerializer( outputStream, type ); 1003 1004 try 1005 { 1006 serializer.serialize( object ); 1007 } 1008 catch( IOException exception ) 1009 { 1010 LOG.error( "failed serializing token: " + token + " with classname: " + className, exception ); 1011 1012 throw exception; 1013 } 1014 } 1015 1016 private Serializer getSerializer( DataOutputStream outputStream, Class<?> type ) throws IOException 1017 { 1018 Serializer serializer = serializers.get( type ); 1019 1020 if( serializer == null ) 1021 { 1022 serializer = tupleSerialization.getNewSerializer( type ); 1023 serializer.open( outputStream ); 1024 serializers.put( type, serializer ); 1025 } 1026 1027 return serializer; 1028 } 1029 1030 public void write( DataOutputStream outputStream, Class<?> type, Object object ) throws IOException 1031 { 1032 Serializer serializer = getSerializer( outputStream, type ); 1033 1034 try 1035 { 1036 serializer.serialize( object ); 1037 } 1038 catch( IOException exception ) 1039 { 1040 LOG.error( "failed serializing type: " + type.getName(), exception ); 1041 1042 throw exception; 1043 } 1044 } 1045 1046 public void close() 1047 { 1048 if( serializers.size() == 0 ) 1049 return; 1050 1051 Collection<Serializer> clone = new ArrayList<Serializer>( serializers.values() ); 1052 1053 serializers.clear(); 1054 1055 for( Serializer serializer : 
clone ) 1056 { 1057 try 1058 { 1059 serializer.close(); 1060 } 1061 catch( IOException exception ) 1062 { 1063 // do nothing 1064 } 1065 } 1066 } 1067 } 1068 }