/*
 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
 *
 * Project and contact information: https://cascading.wensel.net/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.tez.planner;

import cascading.flow.hadoop.planner.rule.transformer.ReplaceAccumulateTapWithDistCacheTransformer;
import cascading.flow.planner.rule.RuleRegistry;
import cascading.flow.planner.rule.annotator.LogicalMergeAnnotator;
import cascading.flow.planner.rule.assertion.BufferAfterEveryAssert;
import cascading.flow.planner.rule.assertion.EveryAfterBufferAssert;
import cascading.flow.planner.rule.assertion.LoneGroupAssert;
import cascading.flow.planner.rule.assertion.MissingGroupAssert;
import cascading.flow.planner.rule.assertion.SplitBeforeEveryAssert;
import cascading.flow.planner.rule.partitioner.WholeGraphStepPartitioner;
import cascading.flow.planner.rule.transformer.ApplyAssertionLevelTransformer;
import cascading.flow.planner.rule.transformer.ApplyDebugLevelTransformer;
import cascading.flow.planner.rule.transformer.RemoveNoOpPipeTransformer;
import cascading.flow.tez.planner.rule.annotator.AccumulatedPostNodeAnnotator;
import cascading.flow.tez.planner.rule.assertion.DualStreamedAccumulatedMergeNodeAssert;
import cascading.flow.tez.planner.rule.partitioner.BottomUpBoundariesNodePartitioner;
import cascading.flow.tez.planner.rule.partitioner.BottomUpJoinedBoundariesNodePartitioner;
import cascading.flow.tez.planner.rule.partitioner.BottomUpJoinedBoundariesTriangleNodePartitioner;
import cascading.flow.tez.planner.rule.partitioner.ConsecutiveGroupOrMergesNodePartitioner;
import cascading.flow.tez.planner.rule.partitioner.SplitJoinBoundariesNodeRePartitioner;
import cascading.flow.tez.planner.rule.partitioner.StreamedAccumulatedBoundariesNodeRePartitioner;
import cascading.flow.tez.planner.rule.partitioner.StreamedOnlySourcesNodeRePartitioner;
import cascading.flow.tez.planner.rule.partitioner.TopDownSplitBoundariesNodePartitioner;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceCheckpointTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupBlockingHashJoinTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitHashJoinTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitSpliceTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceGroupSplitTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceHashJoinSameSourceTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceHashJoinToHashJoinTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceJoinSplitTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceSplitSplitToStreamedHashJoinTransformer;
import cascading.flow.tez.planner.rule.transformer.BoundaryBalanceSplitToStreamedHashJoinTransformer;
import cascading.flow.tez.planner.rule.transformer.RemoveMalformedHashJoinNodeTransformer;

057/**
058 * The HashJoinHadoop2TezRuleRegistry provides support for assemblies using {@link cascading.pipe.HashJoin} pipes.
059 * <p>
060 * Detecting and optimizing for HashJoin pipes adds further complexity and time to converge on a valid physical plan.
061 * <p>
062 * If facing slowdowns, and no HashJoins are used, switch to the
063 * {@link cascading.flow.tez.planner.NoHashJoinHadoop2TezRuleRegistry} via the appropriate
064 * {@link cascading.flow.FlowConnector} constructor.
065 */
066public class HashJoinHadoop2TezRuleRegistry extends RuleRegistry
067  {
068  public HashJoinHadoop2TezRuleRegistry()
069    {
070//    enableDebugLogging();
071
072    // PreBalance
073    addRule( new LoneGroupAssert() );
074    addRule( new MissingGroupAssert() );
075    addRule( new BufferAfterEveryAssert() );
076    addRule( new EveryAfterBufferAssert() );
077    addRule( new SplitBeforeEveryAssert() );
078
079    addRule( new BoundaryBalanceGroupSplitTransformer() );
080    addRule( new BoundaryBalanceGroupSplitSpliceTransformer() ); // prevents AssemblyHelpersPlatformTest#testSameSourceMerge deadlock
081    addRule( new BoundaryBalanceCheckpointTransformer() );
082
083    // hash join
084    addRule( new BoundaryBalanceHashJoinSameSourceTransformer() );
085    addRule( new BoundaryBalanceSplitToStreamedHashJoinTransformer() ); // testGroupBySplitGroupByJoin
086    addRule( new BoundaryBalanceSplitSplitToStreamedHashJoinTransformer() ); // testGroupBySplitSplitGroupByJoin
087    addRule( new BoundaryBalanceHashJoinToHashJoinTransformer() ); // force HJ into unique nodes
088    addRule( new BoundaryBalanceGroupBlockingHashJoinTransformer() ); // joinAfterEvery
089    addRule( new BoundaryBalanceGroupSplitHashJoinTransformer() ); // groupBySplitJoins
090    addRule( new BoundaryBalanceJoinSplitTransformer() ); // prevents duplication of HashJoin, testJoinSplit
091
092    // PreResolve
093    addRule( new RemoveNoOpPipeTransformer() );
094    addRule( new ApplyAssertionLevelTransformer() );
095    addRule( new ApplyDebugLevelTransformer() );
096    addRule( new LogicalMergeAnnotator() ); // MergePipesPlatformTest#testSameSourceMergeHashJoin
097    addRule( new ReplaceAccumulateTapWithDistCacheTransformer() );
098
099    // PostResolve
100
101    // PartitionSteps
102    addRule( new WholeGraphStepPartitioner() );
103
104    // PostSteps
105
106    // PartitionNodes
107
108    // no match with HashJoin inclusion
109    addRule( new TopDownSplitBoundariesNodePartitioner() ); // split from source to multiple sinks
110    addRule( new ConsecutiveGroupOrMergesNodePartitioner() );
111    addRule( new BottomUpBoundariesNodePartitioner() ); // streamed paths re-partitioned w/ StreamedOnly
112    addRule( new SplitJoinBoundariesNodeRePartitioner() ); // testCoGroupSelf - compensates for tez-1190
113
114    // hash join inclusion
115    addRule( new BottomUpJoinedBoundariesNodePartitioner() ); // will capture multiple inputs into sink for use with HashJoins
116    addRule( new BottomUpJoinedBoundariesTriangleNodePartitioner() ); // will capture multiple inputs into sink for use with HashJoins
117    addRule( new StreamedAccumulatedBoundariesNodeRePartitioner() ); // joinsIntoCoGroupLhs & groupBySplitJoins
118    addRule( new StreamedOnlySourcesNodeRePartitioner() );
119
120    // PostNodes
121    addRule( new RemoveMalformedHashJoinNodeTransformer() ); // joinsIntoCoGroupLhs
122    addRule( new AccumulatedPostNodeAnnotator() ); // allows accumulated boundaries to be identified
123
124    addRule( new DualStreamedAccumulatedMergeNodeAssert() );
125    }
126  }