001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.flow.tez.planner; 022 023import cascading.flow.hadoop.planner.rule.transformer.ReplaceAccumulateTapWithDistCacheTransformer; 024import cascading.flow.planner.rule.RuleRegistry; 025import cascading.flow.planner.rule.annotator.LogicalMergeAnnotator; 026import cascading.flow.planner.rule.assertion.BufferAfterEveryAssert; 027import cascading.flow.planner.rule.assertion.EveryAfterBufferAssert; 028import cascading.flow.planner.rule.assertion.LoneGroupAssert; 029import cascading.flow.planner.rule.assertion.MissingGroupAssert; 030import cascading.flow.planner.rule.assertion.SplitBeforeEveryAssert; 031import cascading.flow.planner.rule.partitioner.WholeGraphStepPartitioner; 032import cascading.flow.planner.rule.transformer.ApplyAssertionLevelTransformer; 033import cascading.flow.planner.rule.transformer.ApplyDebugLevelTransformer; 034import cascading.flow.planner.rule.transformer.RemoveNoOpPipeTransformer; 035import cascading.flow.tez.planner.rule.annotator.AccumulatedPostNodeAnnotator; 036import cascading.flow.tez.planner.rule.assertion.DualStreamedAccumulatedMergeNodeAssert; 037import cascading.flow.tez.planner.rule.partitioner.BottomUpBoundariesNodePartitioner; 038import cascading.flow.tez.planner.rule.partitioner.BottomUpJoinedBoundariesNodePartitioner; 039import cascading.flow.tez.planner.rule.partitioner.BottomUpJoinedBoundariesTriangleNodePartitioner; 040import cascading.flow.tez.planner.rule.partitioner.ConsecutiveGroupOrMergesNodePartitioner; 041import cascading.flow.tez.planner.rule.partitioner.SplitJoinBoundariesNodeRePartitioner; 042import cascading.flow.tez.planner.rule.partitioner.StreamedAccumulatedBoundariesNodeRePartitioner; 043import cascading.flow.tez.planner.rule.partitioner.StreamedOnlySourcesNodeRePartitioner; 044import cascading.flow.tez.planner.rule.partitioner.TopDownSplitBoundariesNodePartitioner; 045import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceCheckpointTransformer; 046import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupBlockingHashJoinTransformer; 047import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitHashJoinTransformer; 048import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitSpliceTransformer; 049import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitTransformer; 050import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceHashJoinSameSourceTransformer; 051import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceHashJoinToHashJoinTransformer; 052import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceJoinSplitTransformer; 053import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceSplitSplitToStreamedHashJoinTransformer; 054import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceSplitToStreamedHashJoinTransformer; 055import cascading.flow.tez.planner.rule.transformer.RemoveMalformedHashJoinNodeTransformer; 056 057/** 058 * The HashJoinHadoop2TezRuleRegistry provides support for assemblies using {@link cascading.pipe.HashJoin} pipes. 059 * <p> 060 * Detecting and optimizing for HashJoin pipes adds further complexity and time to converge on a valid physical plan. 061 * <p> 062 * If facing slowdowns, and no HashJoins are used, switch to the 063 * {@link cascading.flow.tez.planner.NoHashJoinHadoop2TezRuleRegistry} via the appropriate 064 * {@link cascading.flow.FlowConnector} constructor. 065 */ 066public class HashJoinHadoop2TezRuleRegistry extends RuleRegistry 067 { 068 public HashJoinHadoop2TezRuleRegistry() 069 { 070// enableDebugLogging(); 071 072 // PreBalance 073 addRule( new LoneGroupAssert() ); 074 addRule( new MissingGroupAssert() ); 075 addRule( new BufferAfterEveryAssert() ); 076 addRule( new EveryAfterBufferAssert() ); 077 addRule( new SplitBeforeEveryAssert() ); 078 079 addRule( new BoundaryBalanceGroupSplitTransformer() ); 080 addRule( new BoundaryBalanceGroupSplitSpliceTransformer() ); // prevents AssemblyHelpersPlatformTest#testSameSourceMerge deadlock 081 addRule( new BoundaryBalanceCheckpointTransformer() ); 082 083 // hash join 084 addRule( new BoundaryBalanceHashJoinSameSourceTransformer() ); 085 addRule( new BoundaryBalanceSplitToStreamedHashJoinTransformer() ); // testGroupBySplitGroupByJoin 086 addRule( new BoundaryBalanceSplitSplitToStreamedHashJoinTransformer() ); // testGroupBySplitSplitGroupByJoin 087 addRule( new BoundaryBalanceHashJoinToHashJoinTransformer() ); // force HJ into unique nodes 088 addRule( new BoundaryBalanceGroupBlockingHashJoinTransformer() ); // joinAfterEvery 089 addRule( new BoundaryBalanceGroupSplitHashJoinTransformer() ); // groupBySplitJoins 090 addRule( new BoundaryBalanceJoinSplitTransformer() ); // prevents duplication of HashJoin, testJoinSplit 091 092 // PreResolve 093 addRule( new RemoveNoOpPipeTransformer() ); 094 addRule( new ApplyAssertionLevelTransformer() ); 095 addRule( new ApplyDebugLevelTransformer() ); 096 addRule( new LogicalMergeAnnotator() ); // MergePipesPlatformTest#testSameSourceMergeHashJoin 097 addRule( new ReplaceAccumulateTapWithDistCacheTransformer() ); 098 099 // PostResolve 100 101 // PartitionSteps 102 addRule( new WholeGraphStepPartitioner() ); 103 104 // PostSteps 105 106 // PartitionNodes 107 108 // no match with HashJoin inclusion 109 addRule( new TopDownSplitBoundariesNodePartitioner() ); // split from source to multiple sinks 110 addRule( new ConsecutiveGroupOrMergesNodePartitioner() ); 111 addRule( new BottomUpBoundariesNodePartitioner() ); // streamed paths re-partitioned w/ StreamedOnly 112 addRule( new SplitJoinBoundariesNodeRePartitioner() ); // testCoGroupSelf - compensates for tez-1190 113 114 // hash join inclusion 115 addRule( new BottomUpJoinedBoundariesNodePartitioner() ); // will capture multiple inputs into sink for use with HashJoins 116 addRule( new BottomUpJoinedBoundariesTriangleNodePartitioner() ); // will capture multiple inputs into sink for use with HashJoins 117 addRule( new StreamedAccumulatedBoundariesNodeRePartitioner() ); // joinsIntoCoGroupLhs & groupBySplitJoins 118 addRule( new StreamedOnlySourcesNodeRePartitioner() ); 119 120 // PostNodes 121 addRule( new RemoveMalformedHashJoinNodeTransformer() ); // joinsIntoCoGroupLhs 122 addRule( new AccumulatedPostNodeAnnotator() ); // allows accumulated boundaries to be identified 123 124 addRule( new DualStreamedAccumulatedMergeNodeAssert() ); 125 } 126 }