/*
 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
 *
 * Project and contact information: https://cascading.wensel.net/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.tez;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.CascadingException;
import cascading.flow.FlowElement;
import cascading.flow.FlowElements;
import cascading.flow.FlowException;
import cascading.flow.FlowNode;
import cascading.flow.FlowSession;
import cascading.flow.SliceCounters;
import cascading.flow.StepCounters;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowNode;
import cascading.flow.stream.duct.Duct;
import cascading.flow.stream.element.ElementDuct;
import cascading.flow.stream.element.InputSource;
import cascading.flow.tez.stream.graph.Hadoop2TezStreamGraph;
import cascading.flow.tez.util.TezUtil;
import cascading.tap.Tap;
import cascading.util.Util;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.runtime.api.AbstractLogicalIOProcessor;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Input;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.LogicalOutput;
import org.apache.tez.runtime.api.ProcessorContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
import static cascading.util.LogUtil.logCounters;
import static cascading.util.LogUtil.logMemory;

/**
 * FlowProcessor is the Tez {@link AbstractLogicalIOProcessor} that executes a single
 * Cascading {@link FlowNode} within a Tez task.
 * <p>
 * Lifecycle: {@link #initialize()} deserializes the planned node from the serialized
 * user payload, {@link #run(Map, Map)} builds the node's stream graph over the supplied
 * logical inputs/outputs and drives it to completion, and {@link #close()} logs final
 * memory and counter state.
 */
public class FlowProcessor extends AbstractLogicalIOProcessor
  {
  private static final Logger LOG = LoggerFactory.getLogger( FlowProcessor.class );

  /** Tez configuration reconstituted from the serialized user payload. */
  private TezConfiguration configuration;
  /** FlowProcess facade handed to the stream graph during execution. */
  private Hadoop2TezFlowProcess currentProcess;
  /** The element-graph node this processor executes, deserialized in {@link #initialize()}. */
  private FlowNode flowNode;
  /** Stream graph wiring the node's sources, pipes, and sinks; built in {@link #run(Map, Map)}. */
  private Hadoop2TezStreamGraph streamGraph;

  public FlowProcessor( ProcessorContext context )
    {
    super( context );
    }

  /**
   * Prepares this processor: restores the configuration from the Tez user payload,
   * applies MR compatibility properties, and deserializes the {@link FlowNode} to run.
   *
   * @throws CascadingException if configuration fails with a Cascading-level error
   * @throws FlowException      wrapping any other failure during configuration
   */
  @Override
  public void initialize() throws Exception
    {
    configuration = new TezConfiguration( TezUtils.createConfFromUserPayload( getContext().getUserPayload() ) );

    TezUtil.setMRProperties( getContext(), configuration, true );

    try
      {
      HadoopUtil.initLog4j( configuration );

      LOG.info( "cascading version: {}", configuration.get( "cascading.version", "" ) );

      currentProcess = new Hadoop2TezFlowProcess( new FlowSession(), getContext(), configuration );

      // the planner serialized the node into the configuration; restore it here
      flowNode = deserializeBase64( configuration.getRaw( FlowNode.CASCADING_FLOW_NODE ), configuration, BaseFlowNode.class );

      LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() );

      logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" );
      }
    catch( Throwable throwable )
      {
      // pass CascadingExceptions through unwrapped so callers see the original type
      if( throwable instanceof CascadingException )
        throw (CascadingException) throwable;

      throw new FlowException( "internal error during processor configuration", throwable );
      }
    }

  /**
   * Builds the stream graph over the given logical inputs/outputs, waits for all
   * inputs (shuffle) to become ready, then runs every accumulated source followed by
   * the single streamed head. Process begin/end/duration counters are always recorded,
   * and {@link Hadoop2TezStreamGraph#cleanup()} always runs, even on failure.
   *
   * @param inputMap  logical inputs keyed by source vertex name
   * @param outputMap logical outputs keyed by sink vertex name
   * @throws CascadingException if a Cascading-level error occurs
   * @throws FlowException      wrapping any other setup or execution failure
   */
  @Override
  public void run( Map<String, LogicalInput> inputMap, Map<String, LogicalOutput> outputMap ) throws Exception
    {
    Collection<Duct> allHeads;
    InputSource streamedHead;

    try
      {
      streamGraph = new Hadoop2TezStreamGraph( currentProcess, flowNode, inputMap, outputMap );

      allHeads = streamGraph.getHeads();
      streamedHead = streamGraph.getStreamedHead();

      for( Duct head : allHeads )
        {
        FlowElement element = ( (ElementDuct) head ).getFlowElement();
        LOG.info( "sourcing from: {} streamed: {}, id: {}", element, head == streamedHead, FlowElements.id( element ) );
        }

      for( Duct tail : streamGraph.getTails() )
        {
        FlowElement element = ( (ElementDuct) tail ).getFlowElement();
        LOG.info( "sinking to: {}, id: {}", element, FlowElements.id( element ) );
        }

      for( Tap trap : flowNode.getTraps() )
        LOG.info( "trapping to: {}, id: {}", trap, FlowElements.id( trap ) );
      }
    catch( Throwable throwable )
      {
      if( throwable instanceof CascadingException )
        throw (CascadingException) throwable;

      throw new FlowException( "internal error during processor configuration", throwable );
      }

    streamGraph.prepare(); // starts inputs

    // wait for shuffle
    waitForInputsReady( inputMap );

    // user code begins executing from here
    long processBeginTime = System.currentTimeMillis();

    currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime );
    currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime );

    try
      {
      try
        {
        // drain every accumulated (non-streamed) source first ...
        for( Duct head : allHeads )
          {
          if( head == streamedHead )
            continue;

          ( (InputSource) head ).run( null );

          logMemory( LOG, "mem after accumulating source: " + ( (ElementDuct) head ).getFlowElement() + ", " );
          }

        // ... then let the streamed head drive the pipeline
        streamedHead.run( null );
        }
      catch( OutOfMemoryError | IOException error )
        {
        throw error; // propagate unchanged, never wrapped
        }
      catch( Throwable throwable )
        {
        if( throwable instanceof CascadingException )
          throw (CascadingException) throwable;

        throw new FlowException( "internal error during processor execution on node: " + flowNode.getOrdinal(), throwable );
        }
      }
    finally
      {
      try
        {
        streamGraph.cleanup();
        }
      finally
        {
        // record end/duration counters even if cleanup itself throws
        long processEndTime = System.currentTimeMillis();
        currentProcess.increment( SliceCounters.Process_End_Time, processEndTime );
        currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime );
        currentProcess.increment( StepCounters.Process_End_Time, processEndTime );
        currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime );
        }
      }
    }

  /**
   * Blocks until all logical inputs report ready (i.e. upstream shuffle data is
   * available), logging how long the wait took.
   *
   * @param inputMap logical inputs keyed by source vertex name
   * @throws InterruptedException if the wait is interrupted
   */
  protected void waitForInputsReady( Map<String, LogicalInput> inputMap ) throws InterruptedException
    {
    long beginInputReady = System.currentTimeMillis();

    Set<Input> inputs = new HashSet<>( inputMap.values() );

    getContext().waitForAllInputsReady( inputs );

    LOG.info( "flow node id: {}, all {} inputs ready in: {}", flowNode.getID(), inputs.size(), Util.formatDurationHMSms( System.currentTimeMillis() - beginInputReady ) );
    }

  /**
   * Events are not acted upon by this processor; they are logged at debug level only.
   */
  @Override
  public void handleEvents( List<Event> events )
    {
    LOG.debug( "in events" );
    }

  /**
   * Logs final memory usage and slice counters for this node. A no-op if
   * {@link #initialize()} failed before {@link #flowNode} or {@link #currentProcess}
   * were assigned, so teardown cannot mask the original failure with an NPE.
   */
  @Override
  public void close() throws Exception
    {
    if( flowNode == null || currentProcess == null )
      return;

    String message = "flow node id: " + flowNode.getID();
    logMemory( LOG, message + ", mem on close" );
    logCounters( LOG, message + ", counter:", currentProcess );
    }
  }