001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.flow.tez.stream.graph; 022 023import java.io.IOException; 024import java.util.Collection; 025import java.util.HashMap; 026import java.util.HashSet; 027import java.util.List; 028import java.util.Map; 029import java.util.Set; 030 031import cascading.flow.FlowElement; 032import cascading.flow.FlowElements; 033import cascading.flow.FlowException; 034import cascading.flow.FlowNode; 035import cascading.flow.FlowProcess; 036import cascading.flow.Flows; 037import cascading.flow.hadoop.stream.HadoopMemoryJoinGate; 038import cascading.flow.hadoop.util.HadoopUtil; 039import cascading.flow.stream.annotations.StreamMode; 040import cascading.flow.stream.duct.Duct; 041import cascading.flow.stream.duct.Gate; 042import cascading.flow.stream.element.InputSource; 043import cascading.flow.stream.element.MemoryHashJoinGate; 044import cascading.flow.stream.element.SinkStage; 045import cascading.flow.stream.element.SourceStage; 046import cascading.flow.stream.graph.IORole; 047import cascading.flow.stream.graph.NodeStreamGraph; 048import cascading.flow.tez.Hadoop2TezFlowProcess; 049import cascading.flow.tez.stream.element.TezBoundaryStage; 050import cascading.flow.tez.stream.element.TezCoGroupGate; 051import cascading.flow.tez.stream.element.TezGroupByGate; 052import cascading.flow.tez.stream.element.TezMergeGate; 053import cascading.flow.tez.stream.element.TezSinkStage; 054import cascading.flow.tez.stream.element.TezSourceStage; 055import cascading.flow.tez.util.TezUtil; 056import cascading.pipe.Boundary; 057import cascading.pipe.CoGroup; 058import cascading.pipe.Group; 059import cascading.pipe.GroupBy; 060import cascading.pipe.HashJoin; 061import cascading.pipe.Merge; 062import cascading.pipe.Pipe; 063import cascading.tap.Tap; 064import cascading.util.SetMultiMap; 065import cascading.util.SortedListMultiMap; 066import cascading.util.Util; 067import org.apache.hadoop.conf.Configuration; 068import org.apache.tez.dag.api.TezConfiguration; 069import org.apache.tez.runtime.api.LogicalInput; 070import org.apache.tez.runtime.api.LogicalOutput; 071import org.slf4j.Logger; 072import org.slf4j.LoggerFactory; 073 074import static cascading.flow.tez.util.TezUtil.*; 075 076/** 077 * 078 */ 079public class Hadoop2TezStreamGraph extends NodeStreamGraph 080 { 081 private static final Logger LOG = LoggerFactory.getLogger( Hadoop2TezStreamGraph.class ); 082 083 private InputSource streamedHead; 084 private Map<String, LogicalInput> inputMap; 085 private Map<String, LogicalOutput> outputMap; 086 private Map<LogicalInput, Configuration> inputConfigMap = new HashMap<>(); 087 private Map<LogicalOutput, Configuration> outputConfigMap = new HashMap<>(); 088 private SetMultiMap<String, LogicalInput> inputMultiMap; 089 private SetMultiMap<String, LogicalOutput> outputMultiMap; 090 091 public Hadoop2TezStreamGraph( Hadoop2TezFlowProcess currentProcess, FlowNode flowNode, Map<String, LogicalInput> inputMap, Map<String, LogicalOutput> outputMap ) 092 { 093 super( currentProcess, flowNode ); 094 this.inputMap = inputMap; 095 this.outputMap = outputMap; 096 097 buildGraph(); 098 099 setTraps(); 100 setScopes(); 101 102 printGraph( node.getID(), node.getName(), flowProcess.getCurrentSliceNum() ); 103 104 bind(); 105 106 printBoundGraph( node.getID(), node.getName(), flowProcess.getCurrentSliceNum() ); 107 } 108 109 public InputSource getStreamedHead() 110 { 111 return streamedHead; 112 } 113 114 protected void buildGraph() 115 { 116 inputMultiMap = new SetMultiMap<>(); 117 118 for( Map.Entry<String, LogicalInput> entry : inputMap.entrySet() ) 119 { 120 Configuration inputConfiguration = getInputConfiguration( entry.getValue() ); 121 inputConfigMap.put( entry.getValue(), inputConfiguration ); 122 123 inputMultiMap.addAll( getEdgeSourceID( entry.getValue(), inputConfiguration ), entry.getValue() ); 124 } 125 126 outputMultiMap = new SetMultiMap<>(); 127 128 for( Map.Entry<String, LogicalOutput> entry : outputMap.entrySet() ) 129 { 130 Configuration outputConfiguration = getOutputConfiguration( entry.getValue() ); 131 outputConfigMap.put( entry.getValue(), outputConfiguration ); 132 133 outputMultiMap.addAll( TezUtil.getEdgeSinkID( entry.getValue(), outputConfiguration ), entry.getValue() ); 134 } 135 136 // this made the assumption we can have a physical and logical input per vertex. seems we can't 137 if( inputMultiMap.getKeys().size() == 1 ) 138 { 139 streamedSource = Flows.getFlowElementForID( node.getSourceElements(), Util.getFirst( inputMultiMap.getKeys() ) ); 140 } 141 else 142 { 143 Set<FlowElement> sourceElements = new HashSet<>( node.getSourceElements() ); 144 Set<? extends FlowElement> accumulated = node.getSourceElements( StreamMode.Accumulated ); 145 146 sourceElements.removeAll( accumulated ); 147 148 if( sourceElements.size() != 1 ) 149 throw new IllegalStateException( "too many input source keys, got: " + Util.join( sourceElements, ", " ) ); 150 151 streamedSource = Util.getFirst( sourceElements ); 152 } 153 154 LOG.info( "using streamed source: " + streamedSource ); 155 156 streamedHead = handleHead( streamedSource, flowProcess ); 157 158 Set<FlowElement> accumulated = new HashSet<>( node.getSourceElements() ); 159 160 accumulated.remove( streamedSource ); 161 162 Hadoop2TezFlowProcess tezProcess = (Hadoop2TezFlowProcess) flowProcess; 163 TezConfiguration conf = tezProcess.getConfiguration(); 164 165 for( FlowElement flowElement : accumulated ) 166 { 167 LOG.info( "using accumulated source: " + flowElement ); 168 169 if( flowElement instanceof Tap ) 170 { 171 Tap source = (Tap) flowElement; 172 173 // allows client side config to be used cluster side 174 String property = conf.getRaw( "cascading.node.accumulated.source.conf." + Tap.id( source ) ); 175 176 if( property == null ) 177 throw new IllegalStateException( "accumulated source conf property missing for: " + source.getIdentifier() ); 178 179 conf = getSourceConf( tezProcess, conf, property ); 180 } 181 else 182 { 183 conf = (TezConfiguration) inputConfigMap.get( FlowElements.id( flowElement ) ); 184 } 185 186 FlowProcess flowProcess = conf == null ? tezProcess : new Hadoop2TezFlowProcess( tezProcess, conf ); 187 188 handleHead( flowElement, flowProcess ); 189 } 190 } 191 192 private TezConfiguration getSourceConf( FlowProcess<TezConfiguration> flowProcess, TezConfiguration conf, String property ) 193 { 194 Map<String, String> priorConf; 195 196 try 197 { 198 priorConf = (Map<String, String>) HadoopUtil.deserializeBase64( property, conf, HashMap.class, true ); 199 } 200 catch( IOException exception ) 201 { 202 throw new FlowException( "unable to deserialize properties", exception ); 203 } 204 205 return flowProcess.mergeMapIntoConfig( conf, priorConf ); 206 } 207 208 private InputSource handleHead( FlowElement source, FlowProcess flowProcess ) 209 { 210 Duct sourceDuct; 211 212 if( source instanceof Tap ) 213 sourceDuct = createSourceStage( (Tap) source, flowProcess ); 214 else if( source instanceof Merge ) 215 sourceDuct = createMergeStage( (Merge) source, IORole.source ); 216 else if( source instanceof Boundary ) 217 sourceDuct = createBoundaryStage( (Boundary) source, IORole.source ); 218 else if( ( (Group) source ).isGroupBy() ) 219 sourceDuct = createGroupByGate( (GroupBy) source, IORole.source ); 220 else 221 sourceDuct = createCoGroupGate( (CoGroup) source, IORole.source ); 222 223 addHead( sourceDuct ); 224 225 handleDuct( source, sourceDuct ); 226 227 return (InputSource) sourceDuct; 228 } 229 230 protected SourceStage createSourceStage( Tap source, FlowProcess flowProcess ) 231 { 232 String id = Tap.id( source ); 233 LogicalInput logicalInput = inputMap.get( id ); 234 235 if( logicalInput == null ) 236 logicalInput = inputMap.get( flowProcess.getStringProperty( "cascading.node.source." + id ) ); 237 238 if( logicalInput == null ) 239 return new SourceStage( flowProcess, source ); 240 241 return new TezSourceStage( flowProcess, source, logicalInput ); 242 } 243 244 @Override 245 protected SinkStage createSinkStage( Tap sink ) 246 { 247 String id = Tap.id( sink ); 248 LogicalOutput logicalOutput = outputMap.get( id ); 249 250 if( logicalOutput == null ) 251 logicalOutput = outputMap.get( flowProcess.getStringProperty( "cascading.node.sink." + id ) ); 252 253 if( logicalOutput == null ) 254 throw new IllegalStateException( "could not find output for: " + sink ); 255 256 return new TezSinkStage( flowProcess, sink, logicalOutput ); 257 } 258 259 @Override 260 protected Duct createMergeStage( Merge element, IORole role ) 261 { 262 if( role == IORole.pass ) 263 return super.createMergeStage( element, IORole.pass ); 264 else if( role == IORole.sink ) 265 return createSinkMergeGate( element ); 266 else if( role == IORole.source ) 267 return createSourceMergeGate( element ); 268 else 269 throw new UnsupportedOperationException( "both role not supported with merge" ); 270 } 271 272 private Duct createSourceMergeGate( Merge element ) 273 { 274 return new TezMergeGate( flowProcess, element, IORole.source, createInputMap( element ) ); 275 } 276 277 private Duct createSinkMergeGate( Merge element ) 278 { 279 return new TezMergeGate( flowProcess, element, IORole.sink, findLogicalOutputs( element ) ); 280 } 281 282 @Override 283 protected Duct createBoundaryStage( Boundary element, IORole role ) 284 { 285 if( role == IORole.pass ) 286 return super.createBoundaryStage( element, IORole.pass ); 287 else if( role == IORole.sink ) 288 return createSinkBoundaryStage( element ); 289 else if( role == IORole.source ) 290 return createSourceBoundaryStage( element ); 291 else 292 throw new UnsupportedOperationException( "both role not supported with boundary" ); 293 } 294 295 private Duct createSourceBoundaryStage( Boundary element ) 296 { 297 return new TezBoundaryStage( flowProcess, element, IORole.source, findLogicalInput( element ) ); 298 } 299 300 private Duct createSinkBoundaryStage( Boundary element ) 301 { 302 return new TezBoundaryStage( flowProcess, element, IORole.sink, findLogicalOutputs( element ) ); 303 } 304 305 @Override 306 protected Gate createGroupByGate( GroupBy element, IORole role ) 307 { 308 if( role == IORole.sink ) 309 return createSinkGroupByGate( element ); 310 else 311 return createSourceGroupByGate( element ); 312 } 313 314 @Override 315 protected Gate createCoGroupGate( CoGroup element, IORole role ) 316 { 317 if( role == IORole.sink ) 318 return createSinkCoGroupByGate( element ); 319 else 320 return createSourceCoGroupByGate( element ); 321 } 322 323 private Gate createSinkCoGroupByGate( CoGroup element ) 324 { 325 return new TezCoGroupGate( flowProcess, element, IORole.sink, findLogicalOutput( element ) ); 326 } 327 328 private Gate createSourceCoGroupByGate( CoGroup element ) 329 { 330 return new TezCoGroupGate( flowProcess, element, IORole.source, createInputMap( element ) ); 331 } 332 333 protected Gate createSinkGroupByGate( GroupBy element ) 334 { 335 return new TezGroupByGate( flowProcess, element, IORole.sink, findLogicalOutput( element ) ); 336 } 337 338 protected Gate createSourceGroupByGate( GroupBy element ) 339 { 340 return new TezGroupByGate( flowProcess, element, IORole.source, createInputMap( element ) ); 341 } 342 343 private LogicalOutput findLogicalOutput( Pipe element ) 344 { 345 String id = Pipe.id( element ); 346 LogicalOutput logicalOutput = outputMap.get( id ); 347 348 if( logicalOutput == null ) 349 logicalOutput = outputMap.get( flowProcess.getStringProperty( "cascading.node.sink." + id ) ); 350 351 if( logicalOutput == null ) 352 throw new IllegalStateException( "could not find output for: " + element ); 353 354 return logicalOutput; 355 } 356 357 private Collection<LogicalOutput> findLogicalOutputs( Pipe element ) 358 { 359 String id = Pipe.id( element ); 360 361 return outputMultiMap.getValues( id ); 362 } 363 364 private LogicalInput findLogicalInput( Pipe element ) 365 { 366 String id = Pipe.id( element ); 367 LogicalInput logicalInput = inputMap.get( id ); 368 369 if( logicalInput == null ) 370 logicalInput = inputMap.get( flowProcess.getStringProperty( "cascading.node.source." + id ) ); 371 372 if( logicalInput == null ) 373 throw new IllegalStateException( "could not find input for: " + element ); 374 375 return logicalInput; 376 } 377 378 /** 379 * Maps each input to an ordinal on the flowelement. an input may be bound to multiple ordinals. 380 * 381 * @param element 382 */ 383 private SortedListMultiMap<Integer, LogicalInput> createInputMap( FlowElement element ) 384 { 385 String id = FlowElements.id( element ); 386 SortedListMultiMap<Integer, LogicalInput> ordinalMap = new SortedListMultiMap<>(); 387 388 for( LogicalInput logicalInput : inputMap.values() ) 389 { 390 Configuration configuration = inputConfigMap.get( logicalInput ); 391 392 String foundID = configuration.get( "cascading.node.source" ); 393 394 if( Util.isEmpty( foundID ) ) 395 throw new IllegalStateException( "cascading.node.source property not set on source LogicalInput" ); 396 397 if( !foundID.equals( id ) ) 398 continue; 399 400 String values = configuration.get( "cascading.node.ordinals", "" ); 401 List<Integer> ordinals = Util.split( Integer.class, ",", values ); 402 403 for( Integer ordinal : ordinals ) 404 ordinalMap.put( ordinal, logicalInput ); 405 } 406 407 return ordinalMap; 408 } 409 410 @Override 411 protected MemoryHashJoinGate createNonBlockingJoinGate( HashJoin join ) 412 { 413 return new HadoopMemoryJoinGate( flowProcess, join ); // does not use a latch 414 } 415 }