/*
 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
 *
 * Project and contact information: https://cascading.wensel.net/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.nested.json.hadoop2;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;

import cascading.flow.FlowProcess;
import cascading.nested.json.JSONCoercibleType;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.scheme.hadoop.TextLine;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A JSONTextLine is a type of {@link cascading.scheme.Scheme} for JSON text files. Files are broken into
 * lines, where each line is a JSON object. Either line-feed or carriage-return are used to signal end of line.
 * <p>
 * By default, this scheme returns a {@link Tuple} with one field, "json" with the type {@link JSONCoercibleType}.
 * <p>
 * Any {@link Fields} object passed to the constructor will have the JSONCoercibleType.TYPE type applied.
 * <p>
 * To create a binary JSON file, use the {@link cascading.scheme.hadoop.SequenceFile} Scheme with one or more
 * fields having the JSONCoercibleType type.
 * <p>
 * Note, when supplying a custom {@link ObjectMapper}, the default {@link JSONCoercibleType#TYPE} and ObjectMapper
 * sets the {@link DeserializationFeature#FAIL_ON_READING_DUP_TREE_KEY} Jackson property.
 */
public class JSONTextLine extends TextLine
  {
  /** Default single source/sink field, "json", typed as {@link JSONCoercibleType#TYPE}. */
  public static final Fields DEFAULT_FIELDS = new Fields( "json" ).applyTypes( JSONCoercibleType.TYPE );

  /** Mapper used to parse and serialize each line; may be replaced via a constructor argument. */
  private ObjectMapper mapper = new ObjectMapper();

  {
  // prevents json object from being created with duplicate names at the same level
  // note: this configures only the default mapper above; a caller-supplied mapper
  // replaces this instance unchanged in the canonical constructor
  mapper.setConfig( mapper.getDeserializationConfig()
    .with( DeserializationFeature.FAIL_ON_READING_DUP_TREE_KEY ) );
  }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances returning results
   * with the default field named "json".
   */
  public JSONTextLine()
    {
    this( DEFAULT_FIELDS );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields of Fields
   */
  public JSONTextLine( Fields fields )
    {
    this( fields, null, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields      of Fields
   * @param charsetName of String
   */
  public JSONTextLine( Fields fields, String charsetName )
    {
    this( fields, null, charsetName );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields          of Fields
   * @param sinkCompression of Compress
   */
  public JSONTextLine( Fields fields, Compress sinkCompression )
    {
    this( fields, sinkCompression, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param fields          of Fields
   * @param sinkCompression of Compress
   * @param charsetName     of String
   */
  public JSONTextLine( Fields fields, Compress sinkCompression, String charsetName )
    {
    this( null, fields, sinkCompression, charsetName );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param mapper of ObjectMapper
   * @param fields of Fields
   */
  public JSONTextLine( ObjectMapper mapper, Fields fields )
    {
    this( mapper, fields, null, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param mapper      of ObjectMapper
   * @param fields      of Fields
   * @param charsetName of String
   */
  public JSONTextLine( ObjectMapper mapper, Fields fields, String charsetName )
    {
    this( mapper, fields, null, charsetName );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   *
   * @param mapper          of ObjectMapper
   * @param fields          of Fields
   * @param sinkCompression of Compress
   */
  public JSONTextLine( ObjectMapper mapper, Fields fields, Compress sinkCompression )
    {
    this( mapper, fields, sinkCompression, DEFAULT_CHARSET );
    }

  /**
   * Constructor JSONTextLine creates a new JSONTextLine instance for use with any of the
   * Hadoop based {@link cascading.flow.FlowConnector} instances.
   * <p>
   * This is the canonical constructor; all other constructors delegate here. The given
   * fields must declare exactly one field; if it carries no type, a
   * {@link JSONCoercibleType} backed by the effective mapper is applied.
   *
   * @param mapper          of ObjectMapper, may be null to use the configured default
   * @param fields          of Fields, must declare exactly one field
   * @param sinkCompression of Compress, may be null
   * @param charsetName     of String
   * @throws IllegalArgumentException if fields is null, undefined, or declares more than one field
   */
  public JSONTextLine( ObjectMapper mapper, Fields fields, Compress sinkCompression, String charsetName )
    {
    super( sinkCompression );

    if( mapper != null )
      this.mapper = mapper;

    if( fields == null )
      throw new IllegalArgumentException( "fields may not be null" );

    if( !fields.isDefined() )
      throw new IllegalArgumentException( "fields argument must declare a single field" );

    if( fields.size() != 1 )
      throw new IllegalArgumentException( "may only declare a single source/sink field in the fields argument" );

    fields = fields.hasTypes() ? fields : fields.applyTypes( new JSONCoercibleType( this.mapper ) );

    setSinkFields( fields );
    setSourceFields( fields );

    // throws an exception if not found
    setCharsetName( charsetName );
    }

  @Override
  protected void sourceHandleInput( SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    TupleEntry result = sourceCall.getIncomingEntry();

    Object[] context = sourceCall.getContext();

    Text text = (Text) context[ 1 ];
    JsonNode jsonNode = null;

    if( text.getLength() != 0 )
      {
      ByteArrayInputStream inputStream = new ByteArrayInputStream( text.getBytes(), 0, text.getLength() );

      // try-with-resources guarantees the reader is released even if parsing throws
      try( InputStreamReader reader = new InputStreamReader( inputStream, (Charset) context[ 2 ] ) )
        {
        jsonNode = mapper.readTree( reader );
        }
      }

    // an empty line yields a null json node in the single result position
    result.setObject( 0, jsonNode );
    }

  @Override
  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
    {
    Text text = (Text) sinkCall.getContext()[ 0 ];
    Charset charset = (Charset) sinkCall.getContext()[ 1 ];

    JsonNode jsonNode = (JsonNode) sinkCall.getOutgoingEntry().getTuple().getObject( 0 );

    if( jsonNode == null )
      {
      text.set( "" );
      }
    else
      {
      ByteArrayOutputStream outputStream = new ByteArrayOutputStream( 1024 );

      // try-with-resources flushes and closes the writer even if serialization throws,
      // where previously a failure in writeValue left the writer unclosed
      try( OutputStreamWriter writer = new OutputStreamWriter( outputStream, charset ) )
        {
        mapper.writeValue( writer, jsonNode );
        }

      text.set( outputStream.toByteArray() );
      }

    // it's ok to use NULL here so the collector does not write anything
    sinkCall.getOutput().collect( null, text );
    }

  @Override
  public String getExtension()
    {
    return "json";
    }
  }