001/*
002 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.flow.hadoop;
023
024import java.io.IOException;
025import java.util.Iterator;
026
027import cascading.CascadingException;
028import cascading.flow.FlowException;
029import cascading.flow.FlowNode;
030import cascading.flow.FlowSession;
031import cascading.flow.FlowStep;
032import cascading.flow.SliceCounters;
033import cascading.flow.StepCounters;
034import cascading.flow.hadoop.planner.HadoopFlowStepJob;
035import cascading.flow.hadoop.stream.HadoopGroupGate;
036import cascading.flow.hadoop.stream.graph.HadoopReduceStreamGraph;
037import cascading.flow.hadoop.util.HadoopUtil;
038import cascading.flow.hadoop.util.TimedIterator;
039import cascading.flow.planner.BaseFlowNode;
040import cascading.flow.stream.StopDataNotificationException;
041import cascading.flow.stream.duct.Duct;
042import cascading.flow.stream.element.ElementDuct;
043import cascading.tap.Tap;
044import cascading.tuple.Tuple;
045import cascading.util.LogUtil;
046import cascading.util.Util;
047import org.apache.hadoop.mapred.JobConf;
048import org.apache.hadoop.mapred.MapReduceBase;
049import org.apache.hadoop.mapred.OutputCollector;
050import org.apache.hadoop.mapred.Reducer;
051import org.apache.hadoop.mapred.Reporter;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054
055import static cascading.flow.hadoop.util.HadoopMRUtil.readStateFromDistCache;
056import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
057import static cascading.util.LogUtil.logCounters;
058import static cascading.util.LogUtil.logMemory;
059
060/** Class FlowReducer is the Hadoop Reducer implementation. */
061public class FlowReducer extends MapReduceBase implements Reducer
062  {
063  private static final Logger LOG = LoggerFactory.getLogger( FlowReducer.class );
064
065  private FlowNode flowNode;
066  private HadoopReduceStreamGraph streamGraph;
067  private HadoopFlowProcess currentProcess;
068  private TimedIterator<Tuple>[] timedIterators;
069
070  private boolean calledPrepare = false;
071  private HadoopGroupGate group;
072  private long processBeginTime;
073
074  /** Constructor FlowReducer creates a new FlowReducer instance. */
075  public FlowReducer()
076    {
077    }
078
079  @Override
080  public void configure( JobConf jobConf )
081    {
082    try
083      {
084      super.configure( jobConf );
085      HadoopUtil.initLog4j( jobConf );
086
087      LOG.info( "cascading version: {}", jobConf.get( "cascading.version", "" ) );
088      LOG.info( "child jvm opts: {}", jobConf.get( "mapred.child.java.opts", "" ) );
089
090      currentProcess = new HadoopFlowProcess( new FlowSession(), jobConf, false );
091
092      timedIterators = TimedIterator.iterators( new TimedIterator<Tuple>( currentProcess, SliceCounters.Read_Duration, SliceCounters.Tuples_Read ) );
093
094      String reduceNodeState = jobConf.getRaw( "cascading.flow.step.node.reduce" );
095
096      if( reduceNodeState == null )
097        reduceNodeState = readStateFromDistCache( jobConf, jobConf.get( FlowStep.CASCADING_FLOW_STEP_ID ), "reduce" );
098
099      flowNode = deserializeBase64( reduceNodeState, jobConf, BaseFlowNode.class );
100
101      LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() );
102
103      streamGraph = new HadoopReduceStreamGraph( currentProcess, flowNode, Util.getFirst( flowNode.getSourceElements() ) );
104
105      group = (HadoopGroupGate) streamGraph.getHeads().iterator().next();
106
107      for( Duct head : streamGraph.getHeads() )
108        LOG.info( "sourcing from: " + ( (ElementDuct) head ).getFlowElement() );
109
110      for( Duct tail : streamGraph.getTails() )
111        LOG.info( "sinking to: " + ( (ElementDuct) tail ).getFlowElement() );
112
113      for( Tap trap : flowNode.getTraps() )
114        LOG.info( "trapping to: " + trap );
115
116      logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" );
117      }
118    catch( Throwable throwable )
119      {
120      reportIfLocal( throwable );
121
122      if( throwable instanceof CascadingException )
123        throw (CascadingException) throwable;
124
125      throw new FlowException( "internal error during reducer configuration", throwable );
126      }
127    }
128
129  public void reduce( Object key, Iterator values, OutputCollector output, Reporter reporter ) throws IOException
130    {
131    currentProcess.setReporter( reporter );
132    currentProcess.setOutputCollector( output );
133
134    timedIterators[ 0 ].reset( values ); // allows us to count read tuples
135
136    if( !calledPrepare )
137      {
138      streamGraph.prepare();
139
140      calledPrepare = true;
141
142      processBeginTime = System.currentTimeMillis();
143      currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime );
144      currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime );
145
146      group.start( group );
147      }
148
149    try
150      {
151      group.accept( (Tuple) key, timedIterators );
152      }
153    catch( StopDataNotificationException exception )
154      {
155      LogUtil.logWarnOnce( LOG, "received unsupported stop data notification, ignoring: {}", exception.getMessage() );
156      }
157    catch( OutOfMemoryError error )
158      {
159      throw error;
160      }
161    catch( Throwable throwable )
162      {
163      reportIfLocal( throwable );
164
165      if( throwable instanceof CascadingException )
166        throw (CascadingException) throwable;
167
168      throw new FlowException( "internal error during reducer execution", throwable );
169      }
170    }
171
172  @Override
173  public void close() throws IOException
174    {
175    try
176      {
177      if( calledPrepare )
178        {
179        group.complete( group );
180
181        streamGraph.cleanup();
182        }
183
184      super.close();
185      }
186    finally
187      {
188      if( currentProcess != null )
189        {
190        long processEndTime = System.currentTimeMillis();
191        currentProcess.increment( SliceCounters.Process_End_Time, processEndTime );
192        currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime );
193        currentProcess.increment( StepCounters.Process_End_Time, processEndTime );
194        currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime );
195        }
196
197      String message = "flow node id: " + flowNode.getID();
198      logMemory( LOG, message + ", mem on close" );
199      logCounters( LOG, message + ", counter:", currentProcess );
200      }
201    }
202
203  /**
204   * Report the error to HadoopFlowStepJob if we are running in Hadoops local mode.
205   *
206   * @param throwable The throwable that was thrown.
207   */
208  private void reportIfLocal( Throwable throwable )
209    {
210    if( HadoopUtil.isLocal( currentProcess.getJobConf() ) )
211      HadoopFlowStepJob.reportLocalError( throwable );
212    }
213  }