001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.tap.hadoop.io; 022 023import java.io.DataInput; 024import java.io.DataOutput; 025import java.io.IOException; 026import java.util.HashMap; 027import java.util.Map; 028 029import cascading.flow.hadoop.util.HadoopUtil; 030import cascading.tap.type.FileType; 031import org.apache.hadoop.fs.Path; 032import org.apache.hadoop.io.WritableUtils; 033import org.apache.hadoop.mapred.FileSplit; 034import org.apache.hadoop.mapred.InputSplit; 035import org.apache.hadoop.mapred.JobConf; 036import org.apache.hadoop.mapred.JobConfigurable; 037import org.apache.hadoop.util.ReflectionUtils; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041/** Class MultiInputSplit is used by MultiInputFormat */ 042public class MultiInputSplit implements InputSplit, JobConfigurable 043 { 044 /** 045 * @deprecated see {@link FileType#CASCADING_SOURCE_PATH}. 046 */ 047 @Deprecated 048 public static final String CASCADING_SOURCE_PATH = FileType.CASCADING_SOURCE_PATH; 049 private static final Logger LOG = LoggerFactory.getLogger( MultiInputSplit.class ); 050 051 /** Field jobConf */ 052 private transient JobConf jobConf; 053 /** Field inputSplit */ 054 InputSplit inputSplit; 055 /** Field config */ 056 Map<String, String> config; 057 058 /** 059 * Method getCurrentTapSourcePath finds and returns the current source Tap filename path, if any. 060 * <p> 061 * Use this method inside an Operation to find the current file being processed. 062 * 063 * @param jobConf 064 * @return a String 065 */ 066 public static String getCurrentTapSourcePath( JobConf jobConf ) 067 { 068 return jobConf.get( FileType.CASCADING_SOURCE_PATH ); 069 } 070 071 public MultiInputSplit( InputSplit inputSplit, Map<String, String> config ) 072 { 073 if( inputSplit == null ) 074 throw new IllegalArgumentException( "input split may not be null" ); 075 076 if( config == null ) 077 throw new IllegalArgumentException( "config may not be null" ); 078 079 this.inputSplit = inputSplit; 080 this.config = config; 081 } 082 083 /** 084 * This constructor is used internally by Hadoop. it is expected {@link #configure(org.apache.hadoop.mapred.JobConf)} 085 * and {@link #readFields(java.io.DataInput)} are called to properly initialize. 086 */ 087 public MultiInputSplit() 088 { 089 } 090 091 public void configure( JobConf jobConf ) 092 { 093 this.jobConf = jobConf; 094 } 095 096 public long getLength() throws IOException 097 { 098 return inputSplit.getLength(); 099 } 100 101 public String[] getLocations() throws IOException 102 { 103 return inputSplit.getLocations(); 104 } 105 106 public InputSplit getWrappedInputSplit() 107 { 108 return inputSplit; 109 } 110 111 public void write( DataOutput out ) throws IOException 112 { 113 out.writeUTF( inputSplit.getClass().getName() ); 114 115 String[] keys = config.keySet().toArray( new String[ config.size() ] ); 116 String[] values = new String[ keys.length ]; 117 118 for( int i = 0; i < keys.length; i++ ) 119 values[ i ] = config.get( keys[ i ] ); 120 121 WritableUtils.writeStringArray( out, keys ); 122 WritableUtils.writeStringArray( out, values ); 123 124 inputSplit.write( out ); 125 } 126 127 public void readFields( DataInput in ) throws IOException 128 { 129 String splitType = in.readUTF(); 130 config = new HashMap<String, String>(); 131 132 String[] keys = WritableUtils.readStringArray( in ); 133 String[] values = WritableUtils.readStringArray( in ); 134 135 for( int i = 0; i < keys.length; i++ ) 136 config.put( keys[ i ], values[ i ] ); 137 138 if( LOG.isDebugEnabled() ) 139 { 140 LOG.debug( "current split config diff:" ); 141 for( Map.Entry<String, String> entry : config.entrySet() ) 142 LOG.debug( "key: {}, value: {}", entry.getKey(), entry.getValue() ); 143 } 144 145 JobConf currentConf = HadoopUtil.mergeConf( jobConf, config, false ); 146 147 try 148 { 149 inputSplit = (InputSplit) ReflectionUtils.newInstance( currentConf.getClassByName( splitType ), currentConf ); 150 } 151 catch( ClassNotFoundException exp ) 152 { 153 throw new IOException( "split class " + splitType + " not found" ); 154 } 155 156 inputSplit.readFields( in ); 157 158 if( inputSplit instanceof FileSplit ) 159 { 160 Path path = ( (FileSplit) inputSplit ).getPath(); 161 162 if( path != null ) 163 { 164 jobConf.set( FileType.CASCADING_SOURCE_PATH, path.toString() ); 165 166 LOG.info( "current split input path: {}", path ); 167 } 168 } 169 } 170 }