001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.flow.tez.stream.graph;
022
023import java.io.IOException;
024import java.util.Collection;
025import java.util.HashMap;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Map;
029import java.util.Set;
030
031import cascading.flow.FlowElement;
032import cascading.flow.FlowElements;
033import cascading.flow.FlowException;
034import cascading.flow.FlowNode;
035import cascading.flow.FlowProcess;
036import cascading.flow.Flows;
037import cascading.flow.hadoop.stream.HadoopMemoryJoinGate;
038import cascading.flow.hadoop.util.HadoopUtil;
039import cascading.flow.stream.annotations.StreamMode;
040import cascading.flow.stream.duct.Duct;
041import cascading.flow.stream.duct.Gate;
042import cascading.flow.stream.element.InputSource;
043import cascading.flow.stream.element.MemoryHashJoinGate;
044import cascading.flow.stream.element.SinkStage;
045import cascading.flow.stream.element.SourceStage;
046import cascading.flow.stream.graph.IORole;
047import cascading.flow.stream.graph.NodeStreamGraph;
048import cascading.flow.tez.Hadoop2TezFlowProcess;
049import cascading.flow.tez.stream.element.TezBoundaryStage;
050import cascading.flow.tez.stream.element.TezCoGroupGate;
051import cascading.flow.tez.stream.element.TezGroupByGate;
052import cascading.flow.tez.stream.element.TezMergeGate;
053import cascading.flow.tez.stream.element.TezSinkStage;
054import cascading.flow.tez.stream.element.TezSourceStage;
055import cascading.flow.tez.util.TezUtil;
056import cascading.pipe.Boundary;
057import cascading.pipe.CoGroup;
058import cascading.pipe.Group;
059import cascading.pipe.GroupBy;
060import cascading.pipe.HashJoin;
061import cascading.pipe.Merge;
062import cascading.pipe.Pipe;
063import cascading.tap.Tap;
064import cascading.util.SetMultiMap;
065import cascading.util.SortedListMultiMap;
066import cascading.util.Util;
067import org.apache.hadoop.conf.Configuration;
068import org.apache.tez.dag.api.TezConfiguration;
069import org.apache.tez.runtime.api.LogicalInput;
070import org.apache.tez.runtime.api.LogicalOutput;
071import org.slf4j.Logger;
072import org.slf4j.LoggerFactory;
073
074import static cascading.flow.tez.util.TezUtil.*;
075
076/**
077 *
078 */
079public class Hadoop2TezStreamGraph extends NodeStreamGraph
080  {
081  private static final Logger LOG = LoggerFactory.getLogger( Hadoop2TezStreamGraph.class );
082
083  private InputSource streamedHead;
084  private Map<String, LogicalInput> inputMap;
085  private Map<String, LogicalOutput> outputMap;
086  private Map<LogicalInput, Configuration> inputConfigMap = new HashMap<>();
087  private Map<LogicalOutput, Configuration> outputConfigMap = new HashMap<>();
088  private SetMultiMap<String, LogicalInput> inputMultiMap;
089  private SetMultiMap<String, LogicalOutput> outputMultiMap;
090
091  public Hadoop2TezStreamGraph( Hadoop2TezFlowProcess currentProcess, FlowNode flowNode, Map<String, LogicalInput> inputMap, Map<String, LogicalOutput> outputMap )
092    {
093    super( currentProcess, flowNode );
094    this.inputMap = inputMap;
095    this.outputMap = outputMap;
096
097    buildGraph();
098
099    setTraps();
100    setScopes();
101
102    printGraph( node.getID(), node.getName(), flowProcess.getCurrentSliceNum() );
103
104    bind();
105
106    printBoundGraph( node.getID(), node.getName(), flowProcess.getCurrentSliceNum() );
107    }
108
109  public InputSource getStreamedHead()
110    {
111    return streamedHead;
112    }
113
114  protected void buildGraph()
115    {
116    inputMultiMap = new SetMultiMap<>();
117
118    for( Map.Entry<String, LogicalInput> entry : inputMap.entrySet() )
119      {
120      Configuration inputConfiguration = getInputConfiguration( entry.getValue() );
121      inputConfigMap.put( entry.getValue(), inputConfiguration );
122
123      inputMultiMap.addAll( getEdgeSourceID( entry.getValue(), inputConfiguration ), entry.getValue() );
124      }
125
126    outputMultiMap = new SetMultiMap<>();
127
128    for( Map.Entry<String, LogicalOutput> entry : outputMap.entrySet() )
129      {
130      Configuration outputConfiguration = getOutputConfiguration( entry.getValue() );
131      outputConfigMap.put( entry.getValue(), outputConfiguration );
132
133      outputMultiMap.addAll( TezUtil.getEdgeSinkID( entry.getValue(), outputConfiguration ), entry.getValue() );
134      }
135
136    // this made the assumption we can have a physical and logical input per vertex. seems we can't
137    if( inputMultiMap.getKeys().size() == 1 )
138      {
139      streamedSource = Flows.getFlowElementForID( node.getSourceElements(), Util.getFirst( inputMultiMap.getKeys() ) );
140      }
141    else
142      {
143      Set<FlowElement> sourceElements = new HashSet<>( node.getSourceElements() );
144      Set<? extends FlowElement> accumulated = node.getSourceElements( StreamMode.Accumulated );
145
146      sourceElements.removeAll( accumulated );
147
148      if( sourceElements.size() != 1 )
149        throw new IllegalStateException( "too many input source keys, got: " + Util.join( sourceElements, ", " ) );
150
151      streamedSource = Util.getFirst( sourceElements );
152      }
153
154    LOG.info( "using streamed source: " + streamedSource );
155
156    streamedHead = handleHead( streamedSource, flowProcess );
157
158    Set<FlowElement> accumulated = new HashSet<>( node.getSourceElements() );
159
160    accumulated.remove( streamedSource );
161
162    Hadoop2TezFlowProcess tezProcess = (Hadoop2TezFlowProcess) flowProcess;
163    TezConfiguration conf = tezProcess.getConfiguration();
164
165    for( FlowElement flowElement : accumulated )
166      {
167      LOG.info( "using accumulated source: " + flowElement );
168
169      if( flowElement instanceof Tap )
170        {
171        Tap source = (Tap) flowElement;
172
173        // allows client side config to be used cluster side
174        String property = conf.getRaw( "cascading.node.accumulated.source.conf." + Tap.id( source ) );
175
176        if( property == null )
177          throw new IllegalStateException( "accumulated source conf property missing for: " + source.getIdentifier() );
178
179        conf = getSourceConf( tezProcess, conf, property );
180        }
181      else
182        {
183        conf = (TezConfiguration) inputConfigMap.get( FlowElements.id( flowElement ) );
184        }
185
186      FlowProcess flowProcess = conf == null ? tezProcess : new Hadoop2TezFlowProcess( tezProcess, conf );
187
188      handleHead( flowElement, flowProcess );
189      }
190    }
191
192  private TezConfiguration getSourceConf( FlowProcess<TezConfiguration> flowProcess, TezConfiguration conf, String property )
193    {
194    Map<String, String> priorConf;
195
196    try
197      {
198      priorConf = (Map<String, String>) HadoopUtil.deserializeBase64( property, conf, HashMap.class, true );
199      }
200    catch( IOException exception )
201      {
202      throw new FlowException( "unable to deserialize properties", exception );
203      }
204
205    return flowProcess.mergeMapIntoConfig( conf, priorConf );
206    }
207
208  private InputSource handleHead( FlowElement source, FlowProcess flowProcess )
209    {
210    Duct sourceDuct;
211
212    if( source instanceof Tap )
213      sourceDuct = createSourceStage( (Tap) source, flowProcess );
214    else if( source instanceof Merge )
215      sourceDuct = createMergeStage( (Merge) source, IORole.source );
216    else if( source instanceof Boundary )
217      sourceDuct = createBoundaryStage( (Boundary) source, IORole.source );
218    else if( ( (Group) source ).isGroupBy() )
219      sourceDuct = createGroupByGate( (GroupBy) source, IORole.source );
220    else
221      sourceDuct = createCoGroupGate( (CoGroup) source, IORole.source );
222
223    addHead( sourceDuct );
224
225    handleDuct( source, sourceDuct );
226
227    return (InputSource) sourceDuct;
228    }
229
230  protected SourceStage createSourceStage( Tap source, FlowProcess flowProcess )
231    {
232    String id = Tap.id( source );
233    LogicalInput logicalInput = inputMap.get( id );
234
235    if( logicalInput == null )
236      logicalInput = inputMap.get( flowProcess.getStringProperty( "cascading.node.source." + id ) );
237
238    if( logicalInput == null )
239      return new SourceStage( flowProcess, source );
240
241    return new TezSourceStage( flowProcess, source, logicalInput );
242    }
243
244  @Override
245  protected SinkStage createSinkStage( Tap sink )
246    {
247    String id = Tap.id( sink );
248    LogicalOutput logicalOutput = outputMap.get( id );
249
250    if( logicalOutput == null )
251      logicalOutput = outputMap.get( flowProcess.getStringProperty( "cascading.node.sink." + id ) );
252
253    if( logicalOutput == null )
254      throw new IllegalStateException( "could not find output for: " + sink );
255
256    return new TezSinkStage( flowProcess, sink, logicalOutput );
257    }
258
259  @Override
260  protected Duct createMergeStage( Merge element, IORole role )
261    {
262    if( role == IORole.pass )
263      return super.createMergeStage( element, IORole.pass );
264    else if( role == IORole.sink )
265      return createSinkMergeGate( element );
266    else if( role == IORole.source )
267      return createSourceMergeGate( element );
268    else
269      throw new UnsupportedOperationException( "both role not supported with merge" );
270    }
271
272  private Duct createSourceMergeGate( Merge element )
273    {
274    return new TezMergeGate( flowProcess, element, IORole.source, createInputMap( element ) );
275    }
276
277  private Duct createSinkMergeGate( Merge element )
278    {
279    return new TezMergeGate( flowProcess, element, IORole.sink, findLogicalOutputs( element ) );
280    }
281
282  @Override
283  protected Duct createBoundaryStage( Boundary element, IORole role )
284    {
285    if( role == IORole.pass )
286      return super.createBoundaryStage( element, IORole.pass );
287    else if( role == IORole.sink )
288      return createSinkBoundaryStage( element );
289    else if( role == IORole.source )
290      return createSourceBoundaryStage( element );
291    else
292      throw new UnsupportedOperationException( "both role not supported with boundary" );
293    }
294
295  private Duct createSourceBoundaryStage( Boundary element )
296    {
297    return new TezBoundaryStage( flowProcess, element, IORole.source, findLogicalInput( element ) );
298    }
299
300  private Duct createSinkBoundaryStage( Boundary element )
301    {
302    return new TezBoundaryStage( flowProcess, element, IORole.sink, findLogicalOutputs( element ) );
303    }
304
305  @Override
306  protected Gate createGroupByGate( GroupBy element, IORole role )
307    {
308    if( role == IORole.sink )
309      return createSinkGroupByGate( element );
310    else
311      return createSourceGroupByGate( element );
312    }
313
314  @Override
315  protected Gate createCoGroupGate( CoGroup element, IORole role )
316    {
317    if( role == IORole.sink )
318      return createSinkCoGroupByGate( element );
319    else
320      return createSourceCoGroupByGate( element );
321    }
322
323  private Gate createSinkCoGroupByGate( CoGroup element )
324    {
325    return new TezCoGroupGate( flowProcess, element, IORole.sink, findLogicalOutput( element ) );
326    }
327
328  private Gate createSourceCoGroupByGate( CoGroup element )
329    {
330    return new TezCoGroupGate( flowProcess, element, IORole.source, createInputMap( element ) );
331    }
332
333  protected Gate createSinkGroupByGate( GroupBy element )
334    {
335    return new TezGroupByGate( flowProcess, element, IORole.sink, findLogicalOutput( element ) );
336    }
337
338  protected Gate createSourceGroupByGate( GroupBy element )
339    {
340    return new TezGroupByGate( flowProcess, element, IORole.source, createInputMap( element ) );
341    }
342
343  private LogicalOutput findLogicalOutput( Pipe element )
344    {
345    String id = Pipe.id( element );
346    LogicalOutput logicalOutput = outputMap.get( id );
347
348    if( logicalOutput == null )
349      logicalOutput = outputMap.get( flowProcess.getStringProperty( "cascading.node.sink." + id ) );
350
351    if( logicalOutput == null )
352      throw new IllegalStateException( "could not find output for: " + element );
353
354    return logicalOutput;
355    }
356
357  private Collection<LogicalOutput> findLogicalOutputs( Pipe element )
358    {
359    String id = Pipe.id( element );
360
361    return outputMultiMap.getValues( id );
362    }
363
364  private LogicalInput findLogicalInput( Pipe element )
365    {
366    String id = Pipe.id( element );
367    LogicalInput logicalInput = inputMap.get( id );
368
369    if( logicalInput == null )
370      logicalInput = inputMap.get( flowProcess.getStringProperty( "cascading.node.source." + id ) );
371
372    if( logicalInput == null )
373      throw new IllegalStateException( "could not find input for: " + element );
374
375    return logicalInput;
376    }
377
378  /**
379   * Maps each input to an ordinal on the flowelement. an input may be bound to multiple ordinals.
380   *
381   * @param element
382   */
383  private SortedListMultiMap<Integer, LogicalInput> createInputMap( FlowElement element )
384    {
385    String id = FlowElements.id( element );
386    SortedListMultiMap<Integer, LogicalInput> ordinalMap = new SortedListMultiMap<>();
387
388    for( LogicalInput logicalInput : inputMap.values() )
389      {
390      Configuration configuration = inputConfigMap.get( logicalInput );
391
392      String foundID = configuration.get( "cascading.node.source" );
393
394      if( Util.isEmpty( foundID ) )
395        throw new IllegalStateException( "cascading.node.source property not set on source LogicalInput" );
396
397      if( !foundID.equals( id ) )
398        continue;
399
400      String values = configuration.get( "cascading.node.ordinals", "" );
401      List<Integer> ordinals = Util.split( Integer.class, ",", values );
402
403      for( Integer ordinal : ordinals )
404        ordinalMap.put( ordinal, logicalInput );
405      }
406
407    return ordinalMap;
408    }
409
410  @Override
411  protected MemoryHashJoinGate createNonBlockingJoinGate( HashJoin join )
412    {
413    return new HadoopMemoryJoinGate( flowProcess, join ); // does not use a latch
414    }
415  }