001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.tap.hadoop.io;
022
023import java.io.DataInput;
024import java.io.DataOutput;
025import java.io.IOException;
026import java.util.HashMap;
027import java.util.Map;
028
029import cascading.flow.hadoop.util.HadoopUtil;
030import cascading.tap.type.FileType;
031import org.apache.hadoop.fs.Path;
032import org.apache.hadoop.io.WritableUtils;
033import org.apache.hadoop.mapred.FileSplit;
034import org.apache.hadoop.mapred.InputSplit;
035import org.apache.hadoop.mapred.JobConf;
036import org.apache.hadoop.mapred.JobConfigurable;
037import org.apache.hadoop.util.ReflectionUtils;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041/** Class MultiInputSplit is used by MultiInputFormat */
042public class MultiInputSplit implements InputSplit, JobConfigurable
043  {
044  /**
045   * @deprecated see {@link FileType#CASCADING_SOURCE_PATH}.
046   */
047  @Deprecated
048  public static final String CASCADING_SOURCE_PATH = FileType.CASCADING_SOURCE_PATH;
049  private static final Logger LOG = LoggerFactory.getLogger( MultiInputSplit.class );
050
051  /** Field jobConf */
052  private transient JobConf jobConf;
053  /** Field inputSplit */
054  InputSplit inputSplit;
055  /** Field config */
056  Map<String, String> config;
057
058  /**
059   * Method getCurrentTapSourcePath finds and returns the current source Tap filename path, if any.
060   * <p>
061   * Use this method inside an Operation to find the current file being processed.
062   *
063   * @param jobConf
064   * @return a String
065   */
066  public static String getCurrentTapSourcePath( JobConf jobConf )
067    {
068    return jobConf.get( FileType.CASCADING_SOURCE_PATH );
069    }
070
071  public MultiInputSplit( InputSplit inputSplit, Map<String, String> config )
072    {
073    if( inputSplit == null )
074      throw new IllegalArgumentException( "input split may not be null" );
075
076    if( config == null )
077      throw new IllegalArgumentException( "config may not be null" );
078
079    this.inputSplit = inputSplit;
080    this.config = config;
081    }
082
083  /**
084   * This constructor is used internally by Hadoop. it is expected {@link #configure(org.apache.hadoop.mapred.JobConf)}
085   * and {@link #readFields(java.io.DataInput)} are called to properly initialize.
086   */
087  public MultiInputSplit()
088    {
089    }
090
091  public void configure( JobConf jobConf )
092    {
093    this.jobConf = jobConf;
094    }
095
096  public long getLength() throws IOException
097    {
098    return inputSplit.getLength();
099    }
100
101  public String[] getLocations() throws IOException
102    {
103    return inputSplit.getLocations();
104    }
105
106  public InputSplit getWrappedInputSplit()
107    {
108    return inputSplit;
109    }
110
111  public void write( DataOutput out ) throws IOException
112    {
113    out.writeUTF( inputSplit.getClass().getName() );
114
115    String[] keys = config.keySet().toArray( new String[ config.size() ] );
116    String[] values = new String[ keys.length ];
117
118    for( int i = 0; i < keys.length; i++ )
119      values[ i ] = config.get( keys[ i ] );
120
121    WritableUtils.writeStringArray( out, keys );
122    WritableUtils.writeStringArray( out, values );
123
124    inputSplit.write( out );
125    }
126
127  public void readFields( DataInput in ) throws IOException
128    {
129    String splitType = in.readUTF();
130    config = new HashMap<String, String>();
131
132    String[] keys = WritableUtils.readStringArray( in );
133    String[] values = WritableUtils.readStringArray( in );
134
135    for( int i = 0; i < keys.length; i++ )
136      config.put( keys[ i ], values[ i ] );
137
138    if( LOG.isDebugEnabled() )
139      {
140      LOG.debug( "current split config diff:" );
141      for( Map.Entry<String, String> entry : config.entrySet() )
142        LOG.debug( "key: {}, value: {}", entry.getKey(), entry.getValue() );
143      }
144
145    JobConf currentConf = HadoopUtil.mergeConf( jobConf, config, false );
146
147    try
148      {
149      inputSplit = (InputSplit) ReflectionUtils.newInstance( currentConf.getClassByName( splitType ), currentConf );
150      }
151    catch( ClassNotFoundException exp )
152      {
153      throw new IOException( "split class " + splitType + " not found" );
154      }
155
156    inputSplit.readFields( in );
157
158    if( inputSplit instanceof FileSplit )
159      {
160      Path path = ( (FileSplit) inputSplit ).getPath();
161
162      if( path != null )
163        {
164        jobConf.set( FileType.CASCADING_SOURCE_PATH, path.toString() );
165
166        LOG.info( "current split input path: {}", path );
167        }
168      }
169    }
170  }