001/*
002 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.tap.hadoop.io;
023
024import java.io.Closeable;
025import java.io.IOException;
026
027import cascading.flow.FlowProcess;
028import cascading.flow.hadoop.MapRed;
029import cascading.flow.hadoop.util.HadoopUtil;
030import cascading.tap.Tap;
031import cascading.tap.TapException;
032import cascading.tap.hadoop.util.Hadoop18TapUtil;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.mapred.FileOutputFormat;
036import org.apache.hadoop.mapred.OutputCollector;
037import org.apache.hadoop.mapred.OutputFormat;
038import org.apache.hadoop.mapred.RecordReader;
039import org.apache.hadoop.mapred.RecordWriter;
040import org.apache.hadoop.mapred.Reporter;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;
045
046/**
047 *
048 */
049public class TapOutputCollector implements OutputCollector, Closeable
050  {
051  private static final Logger LOG = LoggerFactory.getLogger( TapOutputCollector.class );
052
053  public static final String PART_TASK_PATTERN = "%s%spart-%05d";
054  public static final String PART_TASK_SEQ_PATTERN = "%s%spart-%05d-%05d";
055
056  /** Field conf */
057  private Configuration conf;
058  /** Field writer */
059  private RecordWriter writer;
060  /** Field filenamePattern */
061  private String filenamePattern;
062  /** Field filename */
063  private String filename;
064  /** Field tap */
065  private Tap<Configuration, RecordReader, OutputCollector> tap;
066  /** Field prefix */
067  private String prefix;
068  /** Field sequence */
069  private long sequence;
070  /** Field isFileOutputFormat */
071  private boolean isFileOutputFormat;
072  private final FlowProcess<? extends Configuration> flowProcess;
073
074  public TapOutputCollector( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap ) throws IOException
075    {
076    this( flowProcess, tap, null );
077    }
078
079  public TapOutputCollector( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, String prefix ) throws IOException
080    {
081    this( flowProcess, tap, prefix, -1 );
082    }
083
084  public TapOutputCollector( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, String prefix, long sequence ) throws IOException
085    {
086    this.tap = tap;
087    this.sequence = sequence;
088    this.prefix = prefix == null || prefix.length() == 0 ? null : prefix;
089    this.flowProcess = flowProcess;
090    this.conf = this.flowProcess.getConfigCopy();
091    this.filenamePattern = this.conf.get( "cascading.tapcollector.partname", sequence == -1 ? PART_TASK_PATTERN : PART_TASK_SEQ_PATTERN );
092
093    initialize();
094    }
095
096  protected void initialize() throws IOException
097    {
098    tap.sinkConfInit( flowProcess, conf );
099
100    OutputFormat outputFormat = asJobConfInstance( conf ).getOutputFormat();
101
102    // todo: use OutputCommitter class
103
104    isFileOutputFormat = outputFormat instanceof FileOutputFormat;
105
106    if( isFileOutputFormat )
107      {
108      Hadoop18TapUtil.setupJob( conf );
109      Hadoop18TapUtil.setupTask( conf );
110
111      int partition = conf.getInt( "mapred.task.partition", conf.getInt( "mapreduce.task.partition", 0 ) );
112
113      long localSequence = sequence == -1 ? 0 : sequence;
114
115      if( prefix != null )
116        filename = String.format( filenamePattern, prefix, "/", partition, localSequence );
117      else
118        filename = String.format( filenamePattern, "", "", partition, localSequence );
119      }
120
121    LOG.info( "creating path: {}", filename );
122
123    writer = outputFormat.getRecordWriter( null, asJobConfInstance( conf ), filename, getReporter() );
124    }
125
126  private Reporter getReporter()
127    {
128    Reporter reporter = Reporter.NULL;
129
130    if( flowProcess instanceof MapRed )
131      reporter = ( (MapRed) flowProcess ).getReporter(); // may return Reporter.NULL
132
133    return reporter;
134    }
135
136  /**
137   * Method collect writes the given values to the {@link Tap} this instance encapsulates.
138   *
139   * @param writableComparable of type WritableComparable
140   * @param writable           of type Writable
141   * @throws IOException when
142   */
143  public void collect( Object writableComparable, Object writable ) throws IOException
144    {
145    flowProcess.keepAlive();
146    writer.write( writableComparable, writable );
147    }
148
149  public void close()
150    {
151    try
152      {
153      if( isFileOutputFormat )
154        LOG.info( "closing tap collector for: {}", new Path( tap.getIdentifier(), filename ) );
155      else
156        LOG.info( "closing tap collector for: {}", tap );
157
158      try
159        {
160        writer.close( getReporter() );
161        }
162      finally
163        {
164        if( isFileOutputFormat )
165          {
166          boolean needsTaskCommit = Hadoop18TapUtil.needsTaskCommit( conf );
167
168          boolean cleanJob = true;
169
170          if( needsTaskCommit )
171            cleanJob = Hadoop18TapUtil.commitTask( conf );
172
173          if( cleanJob ) // don't delete _temporary if still contents
174            Hadoop18TapUtil.cleanupJob( conf );
175
176          if( !HadoopUtil.isInflow( conf ) )
177            Hadoop18TapUtil.writeSuccessMarker( conf );
178          }
179        }
180      }
181    catch( IOException exception )
182      {
183      LOG.warn( "exception closing: {}", filename, exception );
184      throw new TapException( "exception closing: " + filename, exception );
185      }
186    }
187  }