001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.flow.hadoop;
022
023import java.io.IOException;
024import java.util.Iterator;
025
026import cascading.CascadingException;
027import cascading.flow.FlowException;
028import cascading.flow.FlowNode;
029import cascading.flow.FlowSession;
030import cascading.flow.FlowStep;
031import cascading.flow.SliceCounters;
032import cascading.flow.StepCounters;
033import cascading.flow.hadoop.planner.HadoopFlowStepJob;
034import cascading.flow.hadoop.stream.HadoopGroupGate;
035import cascading.flow.hadoop.stream.graph.HadoopReduceStreamGraph;
036import cascading.flow.hadoop.util.HadoopUtil;
037import cascading.flow.hadoop.util.TimedIterator;
038import cascading.flow.planner.BaseFlowNode;
039import cascading.flow.stream.StopDataNotificationException;
040import cascading.flow.stream.duct.Duct;
041import cascading.flow.stream.element.ElementDuct;
042import cascading.tap.Tap;
043import cascading.tuple.Tuple;
044import cascading.util.LogUtil;
045import cascading.util.Util;
046import org.apache.hadoop.mapred.JobConf;
047import org.apache.hadoop.mapred.MapReduceBase;
048import org.apache.hadoop.mapred.OutputCollector;
049import org.apache.hadoop.mapred.Reducer;
050import org.apache.hadoop.mapred.Reporter;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054import static cascading.flow.hadoop.util.HadoopMRUtil.readStateFromDistCache;
055import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
056import static cascading.util.LogUtil.logCounters;
057import static cascading.util.LogUtil.logMemory;
058
059/** Class FlowReducer is the Hadoop Reducer implementation. */
060public class FlowReducer extends MapReduceBase implements Reducer
061  {
062  private static final Logger LOG = LoggerFactory.getLogger( FlowReducer.class );
063
064  private FlowNode flowNode;
065  private HadoopReduceStreamGraph streamGraph;
066  private HadoopFlowProcess currentProcess;
067  private TimedIterator<Tuple>[] timedIterators;
068
069  private boolean calledPrepare = false;
070  private HadoopGroupGate group;
071  private long processBeginTime;
072
073  /** Constructor FlowReducer creates a new FlowReducer instance. */
074  public FlowReducer()
075    {
076    }
077
078  @Override
079  public void configure( JobConf jobConf )
080    {
081    try
082      {
083      super.configure( jobConf );
084      HadoopUtil.initLog4j( jobConf );
085
086      LOG.info( "cascading version: {}", jobConf.get( "cascading.version", "" ) );
087      LOG.info( "child jvm opts: {}", jobConf.get( "mapred.child.java.opts", "" ) );
088
089      currentProcess = new HadoopFlowProcess( new FlowSession(), jobConf, false );
090
091      timedIterators = TimedIterator.iterators( new TimedIterator<Tuple>( currentProcess, SliceCounters.Read_Duration, SliceCounters.Tuples_Read ) );
092
093      String reduceNodeState = jobConf.getRaw( "cascading.flow.step.node.reduce" );
094
095      if( reduceNodeState == null )
096        reduceNodeState = readStateFromDistCache( jobConf, jobConf.get( FlowStep.CASCADING_FLOW_STEP_ID ), "reduce" );
097
098      flowNode = deserializeBase64( reduceNodeState, jobConf, BaseFlowNode.class );
099
100      LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() );
101
102      streamGraph = new HadoopReduceStreamGraph( currentProcess, flowNode, Util.getFirst( flowNode.getSourceElements() ) );
103
104      group = (HadoopGroupGate) streamGraph.getHeads().iterator().next();
105
106      for( Duct head : streamGraph.getHeads() )
107        LOG.info( "sourcing from: " + ( (ElementDuct) head ).getFlowElement() );
108
109      for( Duct tail : streamGraph.getTails() )
110        LOG.info( "sinking to: " + ( (ElementDuct) tail ).getFlowElement() );
111
112      for( Tap trap : flowNode.getTraps() )
113        LOG.info( "trapping to: " + trap );
114
115      logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" );
116      }
117    catch( Throwable throwable )
118      {
119      reportIfLocal( throwable );
120
121      if( throwable instanceof CascadingException )
122        throw (CascadingException) throwable;
123
124      throw new FlowException( "internal error during reducer configuration", throwable );
125      }
126    }
127
128  public void reduce( Object key, Iterator values, OutputCollector output, Reporter reporter ) throws IOException
129    {
130    currentProcess.setReporter( reporter );
131    currentProcess.setOutputCollector( output );
132
133    timedIterators[ 0 ].reset( values ); // allows us to count read tuples
134
135    if( !calledPrepare )
136      {
137      streamGraph.prepare();
138
139      calledPrepare = true;
140
141      processBeginTime = System.currentTimeMillis();
142      currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime );
143      currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime );
144
145      group.start( group );
146      }
147
148    try
149      {
150      group.accept( (Tuple) key, timedIterators );
151      }
152    catch( StopDataNotificationException exception )
153      {
154      LogUtil.logWarnOnce( LOG, "received unsupported stop data notification, ignoring: {}", exception.getMessage() );
155      }
156    catch( OutOfMemoryError error )
157      {
158      throw error;
159      }
160    catch( Throwable throwable )
161      {
162      reportIfLocal( throwable );
163
164      if( throwable instanceof CascadingException )
165        throw (CascadingException) throwable;
166
167      throw new FlowException( "internal error during reducer execution", throwable );
168      }
169    }
170
171  @Override
172  public void close() throws IOException
173    {
174    try
175      {
176      if( calledPrepare )
177        {
178        group.complete( group );
179
180        streamGraph.cleanup();
181        }
182
183      super.close();
184      }
185    finally
186      {
187      if( currentProcess != null )
188        {
189        long processEndTime = System.currentTimeMillis();
190        currentProcess.increment( SliceCounters.Process_End_Time, processEndTime );
191        currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime );
192        currentProcess.increment( StepCounters.Process_End_Time, processEndTime );
193        currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime );
194        }
195
196      String message = "flow node id: " + flowNode.getID();
197      logMemory( LOG, message + ", mem on close" );
198      logCounters( LOG, message + ", counter:", currentProcess );
199      }
200    }
201
202  /**
203   * Report the error to HadoopFlowStepJob if we are running in Hadoops local mode.
204   *
205   * @param throwable The throwable that was thrown.
206   */
207  private void reportIfLocal( Throwable throwable )
208    {
209    if( HadoopUtil.isLocal( currentProcess.getJobConf() ) )
210      HadoopFlowStepJob.reportLocalError( throwable );
211    }
212  }