001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.flow.hadoop; 022 023import java.io.IOException; 024import java.util.Iterator; 025 026import cascading.CascadingException; 027import cascading.flow.FlowException; 028import cascading.flow.FlowNode; 029import cascading.flow.FlowSession; 030import cascading.flow.FlowStep; 031import cascading.flow.SliceCounters; 032import cascading.flow.StepCounters; 033import cascading.flow.hadoop.planner.HadoopFlowStepJob; 034import cascading.flow.hadoop.stream.HadoopGroupGate; 035import cascading.flow.hadoop.stream.graph.HadoopReduceStreamGraph; 036import cascading.flow.hadoop.util.HadoopUtil; 037import cascading.flow.hadoop.util.TimedIterator; 038import cascading.flow.planner.BaseFlowNode; 039import cascading.flow.stream.StopDataNotificationException; 040import cascading.flow.stream.duct.Duct; 041import cascading.flow.stream.element.ElementDuct; 042import cascading.tap.Tap; 043import cascading.tuple.Tuple; 044import cascading.util.LogUtil; 045import cascading.util.Util; 046import org.apache.hadoop.mapred.JobConf; 047import org.apache.hadoop.mapred.MapReduceBase; 048import org.apache.hadoop.mapred.OutputCollector; 049import org.apache.hadoop.mapred.Reducer; 050import org.apache.hadoop.mapred.Reporter; 051import org.slf4j.Logger; 052import org.slf4j.LoggerFactory; 053 054import static cascading.flow.hadoop.util.HadoopMRUtil.readStateFromDistCache; 055import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64; 056import static cascading.util.LogUtil.logCounters; 057import static cascading.util.LogUtil.logMemory; 058 059/** Class FlowReducer is the Hadoop Reducer implementation. */ 060public class FlowReducer extends MapReduceBase implements Reducer 061 { 062 private static final Logger LOG = LoggerFactory.getLogger( FlowReducer.class ); 063 064 private FlowNode flowNode; 065 private HadoopReduceStreamGraph streamGraph; 066 private HadoopFlowProcess currentProcess; 067 private TimedIterator<Tuple>[] timedIterators; 068 069 private boolean calledPrepare = false; 070 private HadoopGroupGate group; 071 private long processBeginTime; 072 073 /** Constructor FlowReducer creates a new FlowReducer instance. */ 074 public FlowReducer() 075 { 076 } 077 078 @Override 079 public void configure( JobConf jobConf ) 080 { 081 try 082 { 083 super.configure( jobConf ); 084 HadoopUtil.initLog4j( jobConf ); 085 086 LOG.info( "cascading version: {}", jobConf.get( "cascading.version", "" ) ); 087 LOG.info( "child jvm opts: {}", jobConf.get( "mapred.child.java.opts", "" ) ); 088 089 currentProcess = new HadoopFlowProcess( new FlowSession(), jobConf, false ); 090 091 timedIterators = TimedIterator.iterators( new TimedIterator<Tuple>( currentProcess, SliceCounters.Read_Duration, SliceCounters.Tuples_Read ) ); 092 093 String reduceNodeState = jobConf.getRaw( "cascading.flow.step.node.reduce" ); 094 095 if( reduceNodeState == null ) 096 reduceNodeState = readStateFromDistCache( jobConf, jobConf.get( FlowStep.CASCADING_FLOW_STEP_ID ), "reduce" ); 097 098 flowNode = deserializeBase64( reduceNodeState, jobConf, BaseFlowNode.class ); 099 100 LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() ); 101 102 streamGraph = new HadoopReduceStreamGraph( currentProcess, flowNode, Util.getFirst( flowNode.getSourceElements() ) ); 103 104 group = (HadoopGroupGate) streamGraph.getHeads().iterator().next(); 105 106 for( Duct head : streamGraph.getHeads() ) 107 LOG.info( "sourcing from: " + ( (ElementDuct) head ).getFlowElement() ); 108 109 for( Duct tail : streamGraph.getTails() ) 110 LOG.info( "sinking to: " + ( (ElementDuct) tail ).getFlowElement() ); 111 112 for( Tap trap : flowNode.getTraps() ) 113 LOG.info( "trapping to: " + trap ); 114 115 logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" ); 116 } 117 catch( Throwable throwable ) 118 { 119 reportIfLocal( throwable ); 120 121 if( throwable instanceof CascadingException ) 122 throw (CascadingException) throwable; 123 124 throw new FlowException( "internal error during reducer configuration", throwable ); 125 } 126 } 127 128 public void reduce( Object key, Iterator values, OutputCollector output, Reporter reporter ) throws IOException 129 { 130 currentProcess.setReporter( reporter ); 131 currentProcess.setOutputCollector( output ); 132 133 timedIterators[ 0 ].reset( values ); // allows us to count read tuples 134 135 if( !calledPrepare ) 136 { 137 streamGraph.prepare(); 138 139 calledPrepare = true; 140 141 processBeginTime = System.currentTimeMillis(); 142 currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime ); 143 currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime ); 144 145 group.start( group ); 146 } 147 148 try 149 { 150 group.accept( (Tuple) key, timedIterators ); 151 } 152 catch( StopDataNotificationException exception ) 153 { 154 LogUtil.logWarnOnce( LOG, "received unsupported stop data notification, ignoring: {}", exception.getMessage() ); 155 } 156 catch( OutOfMemoryError error ) 157 { 158 throw error; 159 } 160 catch( Throwable throwable ) 161 { 162 reportIfLocal( throwable ); 163 164 if( throwable instanceof CascadingException ) 165 throw (CascadingException) throwable; 166 167 throw new FlowException( "internal error during reducer execution", throwable ); 168 } 169 } 170 171 @Override 172 public void close() throws IOException 173 { 174 try 175 { 176 if( calledPrepare ) 177 { 178 group.complete( group ); 179 180 streamGraph.cleanup(); 181 } 182 183 super.close(); 184 } 185 finally 186 { 187 if( currentProcess != null ) 188 { 189 long processEndTime = System.currentTimeMillis(); 190 currentProcess.increment( SliceCounters.Process_End_Time, processEndTime ); 191 currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime ); 192 currentProcess.increment( StepCounters.Process_End_Time, processEndTime ); 193 currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime ); 194 } 195 196 String message = "flow node id: " + flowNode.getID(); 197 logMemory( LOG, message + ", mem on close" ); 198 logCounters( LOG, message + ", counter:", currentProcess ); 199 } 200 } 201 202 /** 203 * Report the error to HadoopFlowStepJob if we are running in Hadoops local mode. 204 * 205 * @param throwable The throwable that was thrown. 206 */ 207 private void reportIfLocal( Throwable throwable ) 208 { 209 if( HadoopUtil.isLocal( currentProcess.getJobConf() ) ) 210 HadoopFlowStepJob.reportLocalError( throwable ); 211 } 212 }