001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.flow.hadoop.stream.graph; 022 023import java.io.IOException; 024import java.util.HashMap; 025import java.util.Map; 026import java.util.Set; 027 028import cascading.flow.FlowException; 029import cascading.flow.FlowNode; 030import cascading.flow.FlowProcess; 031import cascading.flow.hadoop.HadoopFlowProcess; 032import cascading.flow.hadoop.stream.HadoopMemoryJoinGate; 033import cascading.flow.hadoop.stream.element.HadoopCoGroupGate; 034import cascading.flow.hadoop.stream.element.HadoopGroupByGate; 035import cascading.flow.hadoop.stream.element.HadoopSinkStage; 036import cascading.flow.hadoop.util.HadoopUtil; 037import cascading.flow.planner.graph.ElementGraphs; 038import cascading.flow.stream.duct.Gate; 039import cascading.flow.stream.element.GroupingSpliceGate; 040import cascading.flow.stream.element.SinkStage; 041import cascading.flow.stream.element.SourceStage; 042import cascading.flow.stream.graph.IORole; 043import cascading.flow.stream.graph.NodeStreamGraph; 044import cascading.pipe.CoGroup; 045import cascading.pipe.GroupBy; 046import cascading.pipe.HashJoin; 047import cascading.tap.Tap; 048import org.apache.hadoop.mapred.JobConf; 049import org.apache.hadoop.mapred.Reporter; 050 051/** 052 * 053 */ 054public class HadoopMapStreamGraph extends NodeStreamGraph 055 { 056 private final Tap source; 057 private SourceStage streamedHead; 058 059 public HadoopMapStreamGraph( HadoopFlowProcess flowProcess, FlowNode node, Tap source ) 060 { 061 super( flowProcess, node, source ); 062 this.source = source; 063 064 buildGraph(); 065 066 setTraps(); 067 setScopes(); 068 069 printGraph( node.getID(), "map", flowProcess.getCurrentSliceNum() ); 070 071 bind(); 072 073 printBoundGraph( node.getID(), "map", flowProcess.getCurrentSliceNum() ); 074 } 075 076 public SourceStage getStreamedHead() 077 { 078 return streamedHead; 079 } 080 081 protected void buildGraph() 082 { 083 streamedHead = handleHead( this.source, flowProcess ); 084 085 Set<Tap> tributaries = ElementGraphs.findSources( elementGraph, Tap.class ); 086 087 tributaries.remove( this.source ); // we cannot stream and accumulate the same source 088 089 // accumulated paths 090 for( Object source : tributaries ) 091 { 092 final HadoopFlowProcess hadoopProcess = (HadoopFlowProcess) flowProcess; 093 JobConf conf = hadoopProcess.getJobConf(); 094 095 // allows client side config to be used cluster side 096 String property = conf.getRaw( "cascading.node.accumulated.source.conf." + Tap.id( (Tap) source ) ); 097 098 if( property == null ) 099 throw new IllegalStateException( "accumulated source conf property missing for: " + ( (Tap) source ).getIdentifier() ); 100 101 conf = getSourceConf( hadoopProcess, conf, property ); 102 103 // the reporter isn't provided until after the #run method is called 104 flowProcess = new HadoopFlowProcess( hadoopProcess, conf ) 105 { 106 @Override 107 public Reporter getReporter() 108 { 109 return hadoopProcess.getReporter(); 110 } 111 }; 112 113 handleHead( (Tap) source, flowProcess ); 114 } 115 } 116 117 private JobConf getSourceConf( HadoopFlowProcess flowProcess, JobConf conf, String property ) 118 { 119 Map<String, String> priorConf; 120 try 121 { 122 priorConf = (Map<String, String>) HadoopUtil.deserializeBase64( property, conf, HashMap.class, true ); 123 } 124 catch( IOException exception ) 125 { 126 throw new FlowException( "unable to deserialize properties", exception ); 127 } 128 129 return flowProcess.mergeMapIntoConfig( conf, priorConf ); 130 } 131 132 private SourceStage handleHead( Tap source, FlowProcess flowProcess ) 133 { 134 SourceStage sourceDuct = new SourceStage( flowProcess, source ); 135 136 addHead( sourceDuct ); 137 138 handleDuct( source, sourceDuct ); 139 140 return sourceDuct; 141 } 142 143 @Override 144 protected SinkStage createSinkStage( Tap element ) 145 { 146 return new HadoopSinkStage( flowProcess, element ); 147 } 148 149 @Override 150 protected Gate createCoGroupGate( CoGroup element, IORole role ) 151 { 152 return new HadoopCoGroupGate( flowProcess, element, IORole.sink ); 153 } 154 155 @Override 156 protected Gate createGroupByGate( GroupBy element, IORole role ) 157 { 158 return new HadoopGroupByGate( flowProcess, element, role ); 159 } 160 161 @Override 162 protected GroupingSpliceGate createNonBlockingJoinGate( HashJoin join ) 163 { 164 return new HadoopMemoryJoinGate( flowProcess, join ); // does not use a latch 165 } 166 }