001/*
002 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.flow.tez;
022
023import java.io.IOException;
024import java.util.Collection;
025import java.util.HashSet;
026import java.util.Iterator;
027import java.util.List;
028import java.util.Map;
029
030import cascading.CascadingException;
031import cascading.flow.FlowElements;
032import cascading.flow.FlowException;
033import cascading.flow.FlowNode;
034import cascading.flow.FlowSession;
035import cascading.flow.SliceCounters;
036import cascading.flow.StepCounters;
037import cascading.flow.hadoop.util.HadoopUtil;
038import cascading.flow.planner.BaseFlowNode;
039import cascading.flow.stream.duct.Duct;
040import cascading.flow.stream.element.ElementDuct;
041import cascading.flow.stream.element.InputSource;
042import cascading.flow.tez.stream.graph.Hadoop2TezStreamGraph;
043import cascading.flow.tez.util.TezUtil;
044import cascading.tap.Tap;
045import cascading.util.Util;
046import org.apache.tez.common.TezUtils;
047import org.apache.tez.dag.api.TezConfiguration;
048import org.apache.tez.runtime.api.AbstractLogicalIOProcessor;
049import org.apache.tez.runtime.api.Event;
050import org.apache.tez.runtime.api.Input;
051import org.apache.tez.runtime.api.LogicalInput;
052import org.apache.tez.runtime.api.LogicalOutput;
053import org.apache.tez.runtime.api.ProcessorContext;
054import org.slf4j.Logger;
055import org.slf4j.LoggerFactory;
056
057import static cascading.flow.hadoop.util.HadoopUtil.deserializeBase64;
058import static cascading.util.LogUtil.logCounters;
059import static cascading.util.LogUtil.logMemory;
060
061/**
062 *
063 */
064public class FlowProcessor extends AbstractLogicalIOProcessor
065  {
066  private static final Logger LOG = LoggerFactory.getLogger( FlowProcessor.class );
067
068  private TezConfiguration configuration;
069  private Hadoop2TezFlowProcess currentProcess;
070  private FlowNode flowNode;
071  private Hadoop2TezStreamGraph streamGraph;
072
073  public FlowProcessor( ProcessorContext context )
074    {
075    super( context );
076    }
077
078  @Override
079  public void initialize() throws Exception
080    {
081    configuration = new TezConfiguration( TezUtils.createConfFromUserPayload( getContext().getUserPayload() ) );
082
083    TezUtil.setMRProperties( getContext(), configuration, true );
084
085    try
086      {
087      HadoopUtil.initLog4j( configuration );
088
089      LOG.info( "cascading version: {}", configuration.get( "cascading.version", "" ) );
090
091      currentProcess = new Hadoop2TezFlowProcess( new FlowSession(), getContext(), configuration );
092
093      flowNode = deserializeBase64( configuration.getRaw( FlowNode.CASCADING_FLOW_NODE ), configuration, BaseFlowNode.class );
094
095      LOG.info( "flow node id: {}, ordinal: {}", flowNode.getID(), flowNode.getOrdinal() );
096
097      logMemory( LOG, "flow node id: " + flowNode.getID() + ", mem on start" );
098      }
099    catch( Throwable throwable )
100      {
101      if( throwable instanceof CascadingException )
102        throw (CascadingException) throwable;
103
104      throw new FlowException( "internal error during processor configuration", throwable );
105      }
106    }
107
108  @Override
109  public void run( Map<String, LogicalInput> inputMap, Map<String, LogicalOutput> outputMap ) throws Exception
110    {
111    Collection<Duct> allHeads;
112    InputSource streamedHead;
113
114    try
115      {
116      streamGraph = new Hadoop2TezStreamGraph( currentProcess, flowNode, inputMap, outputMap );
117
118      allHeads = streamGraph.getHeads();
119      streamedHead = streamGraph.getStreamedHead();
120
121      for( Duct head : allHeads )
122        LOG.info( "sourcing from: {} streamed: {}, id: {}", ( (ElementDuct) head ).getFlowElement(), head == streamedHead, FlowElements.id( ( (ElementDuct) head ).getFlowElement() ) );
123
124      for( Duct tail : streamGraph.getTails() )
125        LOG.info( "sinking to: {}, id: {}", ( (ElementDuct) tail ).getFlowElement(), FlowElements.id( ( (ElementDuct) tail ).getFlowElement() ) );
126
127      for( Tap trap : flowNode.getTraps() )
128        LOG.info( "trapping to: {}, id: {}", trap, FlowElements.id( trap ) );
129      }
130    catch( Throwable throwable )
131      {
132      if( throwable instanceof CascadingException )
133        throw (CascadingException) throwable;
134
135      throw new FlowException( "internal error during processor configuration", throwable );
136      }
137
138    streamGraph.prepare(); // starts inputs
139
140    // wait for shuffle
141    waitForInputsReady( inputMap );
142
143    // user code begins executing from here
144    long processBeginTime = System.currentTimeMillis();
145
146    currentProcess.increment( SliceCounters.Process_Begin_Time, processBeginTime );
147    currentProcess.increment( StepCounters.Process_Begin_Time, processBeginTime );
148
149    Iterator<Duct> iterator = allHeads.iterator();
150
151    try
152      {
153      try
154        {
155        while( iterator.hasNext() )
156          {
157          Duct next = iterator.next();
158
159          if( next != streamedHead )
160            {
161            ( (InputSource) next ).run( null );
162
163            logMemory( LOG, "mem after accumulating source: " + ( (ElementDuct) next ).getFlowElement() + ", " );
164            }
165          }
166
167        streamedHead.run( null );
168        }
169      catch( OutOfMemoryError | IOException error )
170        {
171        throw error;
172        }
173      catch( Throwable throwable )
174        {
175        if( throwable instanceof CascadingException )
176          throw (CascadingException) throwable;
177
178        throw new FlowException( "internal error during processor execution on node: " + flowNode.getOrdinal(), throwable );
179        }
180      }
181    finally
182      {
183      try
184        {
185        streamGraph.cleanup();
186        }
187      finally
188        {
189        long processEndTime = System.currentTimeMillis();
190        currentProcess.increment( SliceCounters.Process_End_Time, processEndTime );
191        currentProcess.increment( SliceCounters.Process_Duration, processEndTime - processBeginTime );
192        currentProcess.increment( StepCounters.Process_End_Time, processEndTime );
193        currentProcess.increment( StepCounters.Process_Duration, processEndTime - processBeginTime );
194        }
195      }
196    }
197
198  protected void waitForInputsReady( Map<String, LogicalInput> inputMap ) throws InterruptedException
199    {
200    long beginInputReady = System.currentTimeMillis();
201
202    HashSet<Input> inputs = new HashSet<Input>( inputMap.values() );
203
204    getContext().waitForAllInputsReady( inputs );
205
206    LOG.info( "flow node id: {}, all {} inputs ready in: {}", flowNode.getID(), inputs.size(), Util.formatDurationHMSms( System.currentTimeMillis() - beginInputReady ) );
207    }
208
209  @Override
210  public void handleEvents( List<Event> events )
211    {
212    LOG.debug( "in events" );
213    }
214
215  @Override
216  public void close() throws Exception
217    {
218    String message = "flow node id: " + flowNode.getID();
219    logMemory( LOG, message + ", mem on close" );
220    logCounters( LOG, message + ", counter:", currentProcess );
221    }
222  }