001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.flow.hadoop.stream.graph;
022
023import java.io.IOException;
024import java.util.HashMap;
025import java.util.Map;
026import java.util.Set;
027
028import cascading.flow.FlowException;
029import cascading.flow.FlowNode;
030import cascading.flow.FlowProcess;
031import cascading.flow.hadoop.HadoopFlowProcess;
032import cascading.flow.hadoop.stream.HadoopMemoryJoinGate;
033import cascading.flow.hadoop.stream.element.HadoopCoGroupGate;
034import cascading.flow.hadoop.stream.element.HadoopGroupByGate;
035import cascading.flow.hadoop.stream.element.HadoopSinkStage;
036import cascading.flow.hadoop.util.HadoopUtil;
037import cascading.flow.planner.graph.ElementGraphs;
038import cascading.flow.stream.duct.Gate;
039import cascading.flow.stream.element.GroupingSpliceGate;
040import cascading.flow.stream.element.SinkStage;
041import cascading.flow.stream.element.SourceStage;
042import cascading.flow.stream.graph.IORole;
043import cascading.flow.stream.graph.NodeStreamGraph;
044import cascading.pipe.CoGroup;
045import cascading.pipe.GroupBy;
046import cascading.pipe.HashJoin;
047import cascading.tap.Tap;
048import org.apache.hadoop.mapred.JobConf;
049import org.apache.hadoop.mapred.Reporter;
050
051/**
052 *
053 */
054public class HadoopMapStreamGraph extends NodeStreamGraph
055  {
056  private final Tap source;
057  private SourceStage streamedHead;
058
059  public HadoopMapStreamGraph( HadoopFlowProcess flowProcess, FlowNode node, Tap source )
060    {
061    super( flowProcess, node, source );
062    this.source = source;
063
064    buildGraph();
065
066    setTraps();
067    setScopes();
068
069    printGraph( node.getID(), "map", flowProcess.getCurrentSliceNum() );
070
071    bind();
072
073    printBoundGraph( node.getID(), "map", flowProcess.getCurrentSliceNum() );
074    }
075
076  public SourceStage getStreamedHead()
077    {
078    return streamedHead;
079    }
080
081  protected void buildGraph()
082    {
083    streamedHead = handleHead( this.source, flowProcess );
084
085    Set<Tap> tributaries = ElementGraphs.findSources( elementGraph, Tap.class );
086
087    tributaries.remove( this.source ); // we cannot stream and accumulate the same source
088
089    // accumulated paths
090    for( Object source : tributaries )
091      {
092      final HadoopFlowProcess hadoopProcess = (HadoopFlowProcess) flowProcess;
093      JobConf conf = hadoopProcess.getJobConf();
094
095      // allows client side config to be used cluster side
096      String property = conf.getRaw( "cascading.node.accumulated.source.conf." + Tap.id( (Tap) source ) );
097
098      if( property == null )
099        throw new IllegalStateException( "accumulated source conf property missing for: " + ( (Tap) source ).getIdentifier() );
100
101      conf = getSourceConf( hadoopProcess, conf, property );
102
103      // the reporter isn't provided until after the #run method is called
104      flowProcess = new HadoopFlowProcess( hadoopProcess, conf )
105        {
106        @Override
107        public Reporter getReporter()
108          {
109          return hadoopProcess.getReporter();
110          }
111        };
112
113      handleHead( (Tap) source, flowProcess );
114      }
115    }
116
117  private JobConf getSourceConf( HadoopFlowProcess flowProcess, JobConf conf, String property )
118    {
119    Map<String, String> priorConf;
120    try
121      {
122      priorConf = (Map<String, String>) HadoopUtil.deserializeBase64( property, conf, HashMap.class, true );
123      }
124    catch( IOException exception )
125      {
126      throw new FlowException( "unable to deserialize properties", exception );
127      }
128
129    return flowProcess.mergeMapIntoConfig( conf, priorConf );
130    }
131
132  private SourceStage handleHead( Tap source, FlowProcess flowProcess )
133    {
134    SourceStage sourceDuct = new SourceStage( flowProcess, source );
135
136    addHead( sourceDuct );
137
138    handleDuct( source, sourceDuct );
139
140    return sourceDuct;
141    }
142
143  @Override
144  protected SinkStage createSinkStage( Tap element )
145    {
146    return new HadoopSinkStage( flowProcess, element );
147    }
148
149  @Override
150  protected Gate createCoGroupGate( CoGroup element, IORole role )
151    {
152    return new HadoopCoGroupGate( flowProcess, element, IORole.sink );
153    }
154
155  @Override
156  protected Gate createGroupByGate( GroupBy element, IORole role )
157    {
158    return new HadoopGroupByGate( flowProcess, element, role );
159    }
160
161  @Override
162  protected GroupingSpliceGate createNonBlockingJoinGate( HashJoin join )
163    {
164    return new HadoopMemoryJoinGate( flowProcess, join ); // does not use a latch
165    }
166  }