001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.nested.json.hadoop2;
022
023import java.io.ByteArrayInputStream;
024import java.io.ByteArrayOutputStream;
025import java.io.IOException;
026import java.io.InputStreamReader;
027import java.io.OutputStreamWriter;
028import java.nio.charset.Charset;
029
030import cascading.flow.FlowProcess;
031import cascading.nested.json.JSONCoercibleType;
032import cascading.scheme.SinkCall;
033import cascading.scheme.SourceCall;
034import cascading.scheme.hadoop.TextLine;
035import cascading.tuple.Fields;
036import cascading.tuple.Tuple;
037import cascading.tuple.TupleEntry;
038import com.fasterxml.jackson.databind.DeserializationFeature;
039import com.fasterxml.jackson.databind.JsonNode;
040import com.fasterxml.jackson.databind.ObjectMapper;
041import org.apache.hadoop.conf.Configuration;
042import org.apache.hadoop.io.Text;
043import org.apache.hadoop.mapred.OutputCollector;
044import org.apache.hadoop.mapred.RecordReader;
045
046/**
047 * A JSONTextLine is a type of {@link cascading.scheme.Scheme} for JSON text files. Files are broken into
048 * lines, where each line is a JSON object. Either line-feed or carriage-return are used to signal end of line.
049 * <p>
050 * By default, this scheme returns a {@link Tuple} with one field, "json" with the type {@link JSONCoercibleType}.
051 * <p>
052 * Any {@link Fields} object passed to the constructor will have the JSONCoercibleType.TYPE type applied.
053 * <p>
054 * To create a binary JSON file, use the {@link cascading.scheme.hadoop.SequenceFile} Scheme with one or more
055 * fields having the JSONCoercibleType type.
056 * <p>
057 * Note, when supplying a custom {@link ObjectMapper}, the default {@link JSONCoercibleType#TYPE} and ObjectMapper
058 * sets the {@link DeserializationFeature#FAIL_ON_READING_DUP_TREE_KEY} Jackson property.
059 */
060public class JSONTextLine extends TextLine
061  {
062  public static final Fields DEFAULT_FIELDS = new Fields( "json" ).applyTypes( JSONCoercibleType.TYPE );
063
064  private ObjectMapper mapper = new ObjectMapper();
065
066  {
067  // prevents json object from being created with duplicate names at the same level
068  mapper.setConfig( mapper.getDeserializationConfig()
069    .with( DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY ) );
070  }
071
072  /**
073   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
074   * Hadoop based {@link cascading.flow.FlowConnector} instances returning results
075   * with the default field named "json".
076   */
077  public JSONTextLine()
078    {
079    this( DEFAULT_FIELDS );
080    }
081
082  /**
083   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
084   * Hadoop based {@link cascading.flow.FlowConnector} instances.
085   *
086   * @param fields of Fields
087   */
088  public JSONTextLine( Fields fields )
089    {
090    this( fields, null, DEFAULT_CHARSET );
091    }
092
093  /**
094   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
095   * Hadoop based {@link cascading.flow.FlowConnector} instances.
096   *
097   * @param fields      of Fields
098   * @param charsetName of String
099   */
100  public JSONTextLine( Fields fields, String charsetName )
101    {
102    this( fields, null, charsetName );
103    }
104
105  /**
106   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
107   * Hadoop based {@link cascading.flow.FlowConnector} instances.
108   *
109   * @param fields          of Fields
110   * @param sinkCompression of Compress
111   */
112  public JSONTextLine( Fields fields, Compress sinkCompression )
113    {
114    this( fields, sinkCompression, DEFAULT_CHARSET );
115    }
116
117  /**
118   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
119   * Hadoop based {@link cascading.flow.FlowConnector} instances.
120   *
121   * @param fields          of Fields
122   * @param sinkCompression of Compress
123   * @param charsetName     of String
124   */
125  public JSONTextLine( Fields fields, Compress sinkCompression, String charsetName )
126    {
127    this( null, fields, sinkCompression, charsetName );
128    }
129
130  /**
131   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
132   * Hadoop based {@link cascading.flow.FlowConnector} instances.
133   *
134   * @param mapper of ObjectMapper
135   * @param fields of Fields
136   */
137  public JSONTextLine( ObjectMapper mapper, Fields fields )
138    {
139    this( mapper, fields, null, DEFAULT_CHARSET );
140    }
141
142  /**
143   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
144   * Hadoop based {@link cascading.flow.FlowConnector} instances.
145   *
146   * @param mapper      of ObjectMapper
147   * @param fields      of Fields
148   * @param charsetName of String
149   */
150  public JSONTextLine( ObjectMapper mapper, Fields fields, String charsetName )
151    {
152    this( mapper, fields, null, charsetName );
153    }
154
155  /**
156   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
157   * Hadoop based {@link cascading.flow.FlowConnector} instances.
158   *
159   * @param mapper          of ObjectMapper
160   * @param fields          of Fields
161   * @param sinkCompression of Compress
162   */
163  public JSONTextLine( ObjectMapper mapper, Fields fields, Compress sinkCompression )
164    {
165    this( mapper, fields, sinkCompression, DEFAULT_CHARSET );
166    }
167
168  /**
169   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
170   * Hadoop based {@link cascading.flow.FlowConnector} instances.
171   *
172   * @param mapper          of ObjectMapper
173   * @param fields          of Fields
174   * @param sinkCompression of Compress
175   * @param charsetName     of String
176   */
177  public JSONTextLine( ObjectMapper mapper, Fields fields, Compress sinkCompression, String charsetName )
178    {
179    super( sinkCompression );
180
181    if( mapper != null )
182      this.mapper = mapper;
183
184    if( fields == null )
185      throw new IllegalArgumentException( "fields may not be null" );
186
187    if( !fields.isDefined() )
188      throw new IllegalArgumentException( "fields argument must declare a single field" );
189
190    if( fields.size() != 1 )
191      throw new IllegalArgumentException( "may only declare a single source/sink field in the fields argument" );
192
193    fields = fields.hasTypes() ? fields : fields.applyTypes( new JSONCoercibleType( this.mapper ) );
194
195    setSinkFields( fields );
196    setSourceFields( fields );
197
198    // throws an exception if not found
199    setCharsetName( charsetName );
200    }
201
202  @Override
203  protected void sourceHandleInput( SourceCall<Object[], RecordReader> sourceCall ) throws IOException
204    {
205    TupleEntry result = sourceCall.getIncomingEntry();
206
207    Object[] context = sourceCall.getContext();
208
209    Text text = (Text) context[ 1 ];
210    JsonNode jsonNode = null;
211
212    if( text.getLength() != 0 )
213      {
214      ByteArrayInputStream inputStream = new ByteArrayInputStream( text.getBytes(), 0, text.getLength() );
215      InputStreamReader reader = new InputStreamReader( inputStream, (Charset) context[ 2 ] );
216      jsonNode = mapper.readTree( reader );
217      }
218
219    result.setObject( 0, jsonNode );
220    }
221
222  @Override
223  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
224    {
225    Text text = (Text) sinkCall.getContext()[ 0 ];
226    Charset charset = (Charset) sinkCall.getContext()[ 1 ];
227
228    JsonNode jsonNode = (JsonNode) sinkCall.getOutgoingEntry().getTuple().getObject( 0 );
229
230    if( jsonNode == null )
231      {
232      text.set( "" );
233      }
234    else
235      {
236      ByteArrayOutputStream outputStream = new ByteArrayOutputStream( 1024 );
237      OutputStreamWriter writer = new OutputStreamWriter( outputStream, charset );
238
239      mapper.writeValue( writer, jsonNode );
240
241      writer.close();
242
243      text.set( outputStream.toByteArray() );
244      }
245
246    // it's ok to use NULL here so the collector does not write anything
247    sinkCall.getOutput().collect( null, text );
248    }
249
250  @Override
251  public String getExtension()
252    {
253    return "json";
254    }
255  }