/*
 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.io.IOException;
import java.util.Iterator;

import cascading.CascadingException;
import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowSession;
import cascading.flow.FlowStep;
import cascading.flow.SliceCounters;
import cascading.flow.StepCounters;
import cascading.flow.hadoop.planner.HadoopFlowStepJob;
import cascading.flow.hadoop.stream.HadoopGroupGate;
import cascading.flow.hadoop.stream.graph.HadoopReduceStreamGraph;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.hadoop.util.TimedIterator;
import cascading.flow.planner.BaseFlowNode;
import cascading.flow.stream.StopDataNotificationException;
import cascading.flow.stream.duct.Duct;
import cascading.flow.stream.element.ElementDuct;
import cascading.tap.Tap;
import cascading.tuple.Tuple;
import cascading.util.LogUtil;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopMRUtil.readStateFromDistCache;
import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
import static cascading.util.LogUtil.logCounters;
import static cascading.util.LogUtil.logMemory;

/** Class FlowReducer is the Hadoop Reducer implementation. */
public class FlowReducer extends MapReduceBase implements Reducer
  {
  private static final Logger LOG = LoggerFactory.getLogger( FlowReducer.class );

  /** The reduce-side flow node, deserialized from the job configuration or the distributed cache. */
  private FlowNode flowNode;
  /** The stream graph executed for each incoming key/values group. */
  private HadoopReduceStreamGraph streamGraph;
  /** The per-task flow process wrapping the Hadoop reporter and output collector. */
  private HadoopFlowProcess currentProcess;
  /** Single-element array of iterators used to count tuples as they are read. */
  private TimedIterator<Tuple>[] timedIterators;

  /** True once {@link #reduce} has prepared the stream graph; prepare is deferred to the first call. */
  private boolean calledPrepare = false;
  /** The head group gate of the stream graph that accepts each grouping key and its values. */
  private HadoopGroupGate group;
  /** Wall-clock time, in millis, when the stream graph was prepared; used to compute process duration. */
  private long processBeginTime;

  /** Constructor FlowReducer creates a new FlowReducer instance. */
  public FlowReducer()
    {
    }

  /**
   * Initializes the reducer: deserializes the flow node state, builds the reduce stream graph,
   * and logs the graph topology.
   *
   * @param jobConf the Hadoop job configuration carrying the serialized flow node state
   * @throws CascadingException when initialization fails with a Cascading-specific error
   * @throws FlowException      when initialization fails for any other reason
   */
  @Override
  public void configure( JobConf jobConf )
    {
    try
      {
      super.configure( jobConf );
      HadoopUtil.initLog4j( jobConf );

      LOG.info( "cascading version: {}", jobConf.get( "cascading.version", "" ) );
      LOG.info( "child jvm opts: {}", jobConf.get( "mapred.child.java.opts", "" ) );

      currentProcess = new HadoopFlowProcess( new FlowSession(), jobConf, false );

      timedIterators = TimedIterator.iterators( new TimedIterator<Tuple>( currentProcess, SliceCounters.Read_Duration, SliceCounters.Tuples_Read ) );

      String reduceNodeState = jobConf.getRaw( "cascading.flow.step.node.reduce" );

      // large plans may not fit in the configuration; fall back to the distributed cache
      if( reduceNodeState == null )
        reduceNodeState = readStateFromDistCache( jobConf, jobConf.get( FlowStep.CASCADING_FLOW_STEP_ID ), "reduce" );

      flowNode = deserializeBase64( reduceNodeState, jobConf, BaseFlowNode.class );

      LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() );

      streamGraph = new HadoopReduceStreamGraph( currentProcess, flowNode, Util.getFirst( flowNode.getSourceElements() ) );

      group = (HadoopGroupGate) streamGraph.getHeads().iterator().next();

      for( Duct head : streamGraph.getHeads() )
        LOG.info( "sourcing from: {}", ( (ElementDuct) head ).getFlowElement() );

      for( Duct tail : streamGraph.getTails() )
        LOG.info( "sinking to: {}", ( (ElementDuct) tail ).getFlowElement() );

      for( Tap trap : flowNode.getTraps() )
        LOG.info( "trapping to: {}", trap );

      logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" );
      }
    catch( Throwable throwable )
      {
      reportIfLocal( throwable );

      if( throwable instanceof CascadingException )
        throw (CascadingException) throwable;

      throw new FlowException( "internal error during reducer configuration", throwable );
      }
    }

  /**
   * Feeds one grouping key and its values into the stream graph. The graph is lazily prepared on
   * the first invocation so that preparation cost is only paid when data actually arrives.
   *
   * @param key      the grouping key, a {@link Tuple}
   * @param values   the grouped values for the key
   * @param output   the Hadoop output collector for this task
   * @param reporter the Hadoop progress reporter for this task
   * @throws IOException        declared by the {@link Reducer} contract
   * @throws CascadingException when execution fails with a Cascading-specific error
   * @throws FlowException      when execution fails for any other reason
   */
  @Override
  public void reduce( Object key, Iterator values, OutputCollector output, Reporter reporter ) throws IOException
    {
    currentProcess.setReporter( reporter );
    currentProcess.setOutputCollector( output );

    timedIterators[ 0 ].reset( values ); // allows us to count read tuples

    if( !calledPrepare )
      {
      streamGraph.prepare();

      calledPrepare = true;

      processBeginTime = System.currentTimeMillis();
      currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime );
      currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime );

      group.start( group );
      }

    try
      {
      group.accept( (Tuple) key, timedIterators );
      }
    catch( StopDataNotificationException exception )
      {
      LogUtil.logWarnOnce( LOG, "received unsupported stop data notification, ignoring: {}", exception.getMessage() );
      }
    catch( OutOfMemoryError error )
      {
      // never wrap OOME, let the task fail fast so the framework can retry
      throw error;
      }
    catch( Throwable throwable )
      {
      reportIfLocal( throwable );

      if( throwable instanceof CascadingException )
        throw (CascadingException) throwable;

      throw new FlowException( "internal error during reducer execution", throwable );
      }
    }

  /**
   * Completes the stream graph, if it was prepared, then records end-time counters and logs
   * memory and counter state.
   *
   * @throws IOException if the superclass close fails
   */
  @Override
  public void close() throws IOException
    {
    try
      {
      if( calledPrepare )
        {
        group.complete( group );

        streamGraph.cleanup();
        }

      super.close();
      }
    finally
      {
      if( currentProcess != null )
        {
        long processEndTime = System.currentTimeMillis();
        currentProcess.increment( SliceCounters.Process_End_Time, processEndTime );
        currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime );
        currentProcess.increment( StepCounters.Process_End_Time, processEndTime );
        currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime );
        }

      // flowNode and currentProcess may still be null if configure() failed early; guard so
      // this finally block cannot mask the original failure with a NullPointerException
      if( flowNode != null )
        {
        String message = "flow node id: " + flowNode.getID();
        logMemory( LOG, message + ", mem on close" );

        if( currentProcess != null )
          logCounters( LOG, message + ", counter:", currentProcess );
        }
      }
    }

  /**
   * Report the error to HadoopFlowStepJob if we are running in Hadoops local mode.
   *
   * @param throwable The throwable that was thrown.
   */
  private void reportIfLocal( Throwable throwable )
    {
    if( HadoopUtil.isLocal( currentProcess.getJobConf() ) )
      HadoopFlowStepJob.reportLocalError( throwable );
    }
  }