001/*
002 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved.
003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved.
004 *
005 * Project and contact information: http://www.cascading.org/
006 *
007 * This file is part of the Cascading project.
008 *
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *     http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 */
021
022package cascading.scheme.hadoop;
023
024import java.beans.ConstructorProperties;
025import java.io.IOException;
026import java.nio.charset.Charset;
027
028import cascading.flow.FlowProcess;
029import cascading.management.annotation.Property;
030import cascading.management.annotation.PropertyDescription;
031import cascading.management.annotation.Visibility;
032import cascading.scheme.SinkCall;
033import cascading.scheme.SourceCall;
034import cascading.scheme.util.DelimitedParser;
035import cascading.tap.CompositeTap;
036import cascading.tap.Tap;
037import cascading.tap.TapException;
038import cascading.tap.hadoop.Hfs;
039import cascading.tap.type.TapWith;
040import cascading.tuple.Fields;
041import cascading.tuple.Tuple;
042import cascading.tuple.TupleEntry;
043import cascading.tuple.util.TupleViews;
044import org.apache.hadoop.conf.Configuration;
045import org.apache.hadoop.io.LongWritable;
046import org.apache.hadoop.io.Text;
047import org.apache.hadoop.mapred.OutputCollector;
048import org.apache.hadoop.mapred.RecordReader;
049
050/**
051 * Class TextDelimited is a sub-class of {@link TextLine}. It provides direct support for delimited text files, like
052 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
053 * <p>
054 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
055 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
056 * be skipped.
057 * <p>
058 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
059 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
 * file and used during planning. The header will be parsed with the same rules as the body of the file.
061 * <p>
062 * By default headers are not skipped.
063 * <p>
064 * TextDelimited may also be used to write a "header" in a file. The fields names for the header are taken directly
065 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
066 * resolved field names will be used, if any.
067 * <p>
068 * By default headers are not written.
069 * <p>
 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
 * be set to {@code true}.
072 * <p>
073 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
074 * <p>
075 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
076 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
077 * for the missing fields.
078 * <p>
079 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
080 * If safe is {@code false}, a {@link TapException} will be thrown.
081 * <p>
082 * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is
083 * COMMA delimited but may have COMMA's in a value, the whole value should be surrounded by the quote string, typically
084 * double quotes ({@literal "}).
085 * <p>
086 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
087 * <p>
088 * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically
089 * default to strict == false as the number of fields parsed are arbitrary or unknown. A type array may not be given
090 * either, so all values will be returned as Strings.
091 * <p>
092 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
093 * argument.
094 * <p>
095 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
096 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
097 * <p>
098 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
099 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
100 * <p>
101 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
102 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
103 * <p>
 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
 * are responsible for cleansing large data-sets when faced with the problem.
 * <p>
 * DelimitedParser may be sub-classed and extended if necessary.
108 *
109 * @see TextLine
110 */
111public class TextDelimited extends TextLine
112  {
  /** Charset name constant, {@code "UTF-8"}; the class documentation names UTF-8 as the default text encoding. */
  public static final String DEFAULT_CHARSET = "UTF-8";

  /** Field delimitedParser, the parser instance used for splitting and joining delimited lines */
  protected final DelimitedParser delimitedParser;
  /** Field skipHeader, whether the first line of every input file is skipped */
  private boolean skipHeader;
  /** Field writeHeader, whether a header line is written when sinking */
  private final boolean writeHeader;
120
121  /**
122   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
123   * {@link Fields#ALL} and using TAB as the default delimiter.
124   * <p>
125   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
126   * with a {@link cascading.pipe.Checkpoint} Tap.
127   */
128  public TextDelimited()
129    {
130    this( Fields.ALL, null, "\t", null, null );
131    }
132
133  /**
134   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
135   * {@link Fields#ALL} and using TAB as the default delimiter.
136   * <p>
137   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
138   * with a {@link cascading.pipe.Checkpoint} Tap.
139   *
140   * @param hasHeader of type boolean
141   * @param delimiter of type String
142   */
143  @ConstructorProperties({"hasHeader", "delimiter"})
144  public TextDelimited( boolean hasHeader, String delimiter )
145    {
146    this( Fields.ALL, null, hasHeader, delimiter, null, (Class[]) null );
147    }
148
149  /**
150   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
151   * {@link Fields#ALL} and using TAB as the default delimiter.
152   * <p>
153   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
154   * with a {@link cascading.pipe.Checkpoint} Tap.
155   *
156   * @param hasHeader of type boolean
157   * @param delimiter of type String
158   * @param quote     of type String
159   */
160  @ConstructorProperties({"hasHeader", "delimiter", "quote"})
161  public TextDelimited( boolean hasHeader, String delimiter, String quote )
162    {
163    this( Fields.ALL, null, hasHeader, delimiter, quote, (Class[]) null );
164    }
165
166  /**
167   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
168   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
169   * <p>
170   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
171   * with a {@link cascading.pipe.Checkpoint} Tap.
172   *
173   * @param hasHeader       of type boolean
174   * @param delimitedParser of type DelimitedParser
175   */
176  @ConstructorProperties({"hasHeader", "delimitedParser"})
177  public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser )
178    {
179    this( Fields.ALL, null, hasHeader, hasHeader, delimitedParser );
180    }
181
182  /**
183   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
184   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
185   * <p>
186   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
187   * with a {@link cascading.pipe.Checkpoint} Tap.
188   * <p>
189   * This constructor will set {@code skipHeader} and {@code writeHeader} values to true.
190   *
191   * @param delimitedParser of type DelimitedParser
192   */
193  @ConstructorProperties({"delimitedParser"})
194  public TextDelimited( DelimitedParser delimitedParser )
195    {
196    this( Fields.ALL, null, true, true, delimitedParser );
197    }
198
199  /**
200   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
201   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
202   * <p>
203   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
204   * with a {@link cascading.pipe.Checkpoint} Tap.
205   *
206   * @param sinkCompression of type Compress
207   * @param hasHeader       of type boolean
208   * @param delimitedParser of type DelimitedParser
209   */
210  @ConstructorProperties({"sinkCompression", "hasHeader", "delimitedParser"})
211  public TextDelimited( Compress sinkCompression, boolean hasHeader, DelimitedParser delimitedParser )
212    {
213    this( Fields.ALL, sinkCompression, hasHeader, hasHeader, delimitedParser );
214    }
215
216  /**
217   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
218   * {@link Fields#ALL} and using the given delimitedParser instance for parsing.
219   * <p>
220   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
221   * with a {@link cascading.pipe.Checkpoint} Tap.
222   * <p>
223   * This constructor will set {@code skipHeader} and {@code writeHeader} values to true.
224   *
225   * @param delimitedParser of type DelimitedParser
226   */
227  @ConstructorProperties({"sinkCompression", "delimitedParser"})
228  public TextDelimited( Compress sinkCompression, DelimitedParser delimitedParser )
229    {
230    this( Fields.ALL, sinkCompression, true, true, delimitedParser );
231    }
232
233  /**
234   * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking
235   * {@link Fields#ALL} and using TAB as the default delimiter.
236   * <p>
237   * Use this constructor if the source and sink fields will be resolved during planning, for example, when using
238   * with a {@link cascading.pipe.Checkpoint} Tap.
239   *
240   * @param sinkCompression of type Compress
241   * @param hasHeader       of type boolean
242   * @param delimiter       of type String
243   * @param quote           of type String
244   */
245  @ConstructorProperties({"sinkCompression", "hasHeader", "delimiter", "quote"})
246  public TextDelimited( Compress sinkCompression, boolean hasHeader, String delimiter, String quote )
247    {
248    this( Fields.ALL, sinkCompression, hasHeader, delimiter, quote, (Class[]) null );
249    }
250
251  /**
252   * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter.
253   *
254   * @param fields of type Fields
255   */
256  @ConstructorProperties({"fields"})
257  public TextDelimited( Fields fields )
258    {
259    this( fields, null, "\t", null, null );
260    }
261
262  /**
263   * Constructor TextDelimited creates a new TextDelimited instance.
264   *
265   * @param fields    of type Fields
266   * @param delimiter of type String
267   */
268  @ConstructorProperties({"fields", "delimiter"})
269  public TextDelimited( Fields fields, String delimiter )
270    {
271    this( fields, null, delimiter, null, null );
272    }
273
274  /**
275   * Constructor TextDelimited creates a new TextDelimited instance.
276   *
277   * @param fields    of type Fields
278   * @param hasHeader of type boolean
279   * @param delimiter of type String
280   */
281  @ConstructorProperties({"fields", "hasHeader", "delimiter"})
282  public TextDelimited( Fields fields, boolean hasHeader, String delimiter )
283    {
284    this( fields, null, hasHeader, hasHeader, delimiter, null, null );
285    }
286
287  /**
288   * Constructor TextDelimited creates a new TextDelimited instance.
289   *
290   * @param fields      of type Fields
291   * @param skipHeader  of type boolean
292   * @param writeHeader of type boolean
293   * @param delimiter   of type String
294   */
295  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"})
296  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter )
297    {
298    this( fields, null, skipHeader, writeHeader, delimiter, null, null );
299    }
300
301  /**
302   * Constructor TextDelimited creates a new TextDelimited instance.
303   *
304   * @param fields    of type Fields
305   * @param delimiter of type String
306   * @param types     of type Class[]
307   */
308  @ConstructorProperties({"fields", "delimiter", "types"})
309  public TextDelimited( Fields fields, String delimiter, Class[] types )
310    {
311    this( fields, null, delimiter, null, types );
312    }
313
314  /**
315   * Constructor TextDelimited creates a new TextDelimited instance.
316   *
317   * @param fields    of type Fields
318   * @param hasHeader of type boolean
319   * @param delimiter of type String
320   * @param types     of type Class[]
321   */
322  @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"})
323  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types )
324    {
325    this( fields, null, hasHeader, hasHeader, delimiter, null, types );
326    }
327
328  /**
329   * Constructor TextDelimited creates a new TextDelimited instance.
330   *
331   * @param fields      of type Fields
332   * @param skipHeader  of type boolean
333   * @param writeHeader of type boolean
334   * @param delimiter   of type String
335   * @param types       of type Class[]
336   */
337  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"})
338  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
339    {
340    this( fields, null, skipHeader, writeHeader, delimiter, null, types );
341    }
342
343  /**
344   * Constructor TextDelimited creates a new TextDelimited instance.
345   *
346   * @param fields    of type Fields
347   * @param delimiter of type String
348   * @param quote     of type String
349   * @param types     of type Class[]
350   */
351  @ConstructorProperties({"fields", "delimiter", "quote", "types"})
352  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types )
353    {
354    this( fields, null, delimiter, quote, types );
355    }
356
357  /**
358   * Constructor TextDelimited creates a new TextDelimited instance.
359   *
360   * @param fields    of type Fields
361   * @param hasHeader of type boolean
362   * @param delimiter of type String
363   * @param quote     of type String
364   * @param types     of type Class[]
365   */
366  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"})
367  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types )
368    {
369    this( fields, null, hasHeader, hasHeader, delimiter, quote, types );
370    }
371
372  /**
373   * Constructor TextDelimited creates a new TextDelimited instance.
374   *
375   * @param fields      of type Fields
376   * @param skipHeader  of type boolean
377   * @param writeHeader of type boolean
378   * @param delimiter   of type String
379   * @param quote       of type String
380   * @param types       of type Class[]
381   */
382  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
383  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
384    {
385    this( fields, null, skipHeader, writeHeader, delimiter, quote, types );
386    }
387
388  /**
389   * Constructor TextDelimited creates a new TextDelimited instance.
390   *
391   * @param fields    of type Fields
392   * @param delimiter of type String
393   * @param quote     of type String
394   * @param types     of type Class[]
395   * @param safe      of type boolean
396   */
397  @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"})
398  public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe )
399    {
400    this( fields, null, delimiter, quote, types, safe );
401    }
402
403  /**
404   * Constructor TextDelimited creates a new TextDelimited instance.
405   *
406   * @param fields    of type Fields
407   * @param hasHeader of type boolean
408   * @param delimiter of type String
409   * @param quote     of type String
410   * @param types     of type Class[]
411   * @param safe      of type boolean
412   */
413  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"})
414  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
415    {
416    this( fields, null, hasHeader, hasHeader, delimiter, quote, types, safe );
417    }
418
419  /**
420   * Constructor TextDelimited creates a new TextDelimited instance.
421   *
422   * @param fields      of type Fields
423   * @param hasHeader   of type boolean
424   * @param delimiter   of type String
425   * @param quote       of type String
426   * @param types       of type Class[]
427   * @param safe        of type boolean
428   * @param charsetName of type String
429   */
430  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"})
431  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName )
432    {
433    this( fields, null, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName );
434    }
435
436  /**
437   * Constructor TextDelimited creates a new TextDelimited instance.
438   *
439   * @param fields      of type Fields
440   * @param skipHeader  of type boolean
441   * @param writeHeader of type boolean
442   * @param delimiter   of type String
443   * @param quote       of type String
444   * @param types       of type Class[]
445   * @param safe        of type boolean
446   */
447  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"})
448  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
449    {
450    this( fields, null, skipHeader, writeHeader, delimiter, quote, types, safe );
451    }
452
453  /**
454   * Constructor TextDelimited creates a new TextDelimited instance.
455   *
456   * @param fields          of type Fields
457   * @param sinkCompression of type Compress
458   * @param delimiter       of type String
459   */
460  @ConstructorProperties({"fields", "sinkCompression", "delimiter"})
461  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter )
462    {
463    this( fields, sinkCompression, delimiter, null, null );
464    }
465
466  /**
467   * Constructor TextDelimited creates a new TextDelimited instance.
468   *
469   * @param fields          of type Fields
470   * @param sinkCompression of type Compress
471   * @param hasHeader       of type boolean
472   * @param delimiter       of type String
473   */
474  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter"})
475  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter )
476    {
477    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, null );
478    }
479
480  /**
481   * Constructor TextDelimited creates a new TextDelimited instance.
482   *
483   * @param fields          of type Fields
484   * @param sinkCompression of type Compress
485   * @param skipHeader      of type boolean
486   * @param writeHeader     of type boolean
487   * @param delimiter       of type String
488   */
489  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter"})
490  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter )
491    {
492    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, null );
493    }
494
495  /**
496   * Constructor TextDelimited creates a new TextDelimited instance.
497   *
498   * @param fields          of type Fields
499   * @param sinkCompression of type Compress
500   * @param delimiter       of type String
501   * @param types           of type Class[]
502   */
503  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types"})
504  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types )
505    {
506    this( fields, sinkCompression, delimiter, null, types );
507    }
508
509  /**
510   * Constructor TextDelimited creates a new TextDelimited instance.
511   *
512   * @param fields          of type Fields
513   * @param sinkCompression of type Compress
514   * @param hasHeader       of type boolean
515   * @param delimiter       of type String
516   * @param types           of type Class[]
517   */
518  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types"})
519  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types )
520    {
521    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types );
522    }
523
524  /**
525   * Constructor TextDelimited creates a new TextDelimited instance.
526   *
527   * @param fields          of type Fields
528   * @param sinkCompression of type Compress
529   * @param skipHeader      of type boolean
530   * @param writeHeader     of type boolean
531   * @param delimiter       of type String
532   * @param types           of type Class[]
533   */
534  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types"})
535  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types )
536    {
537    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types );
538    }
539
540  /**
541   * Constructor TextDelimited creates a new TextDelimited instance.
542   *
543   * @param fields          of type Fields
544   * @param sinkCompression of type Compress
545   * @param delimiter       of type String
546   * @param types           of type Class[]
547   * @param safe            of type boolean
548   */
549  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types", "safe"})
550  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types, boolean safe )
551    {
552    this( fields, sinkCompression, delimiter, null, types, safe );
553    }
554
555  /**
556   * Constructor TextDelimited creates a new TextDelimited instance.
557   *
558   * @param fields          of type Fields
559   * @param sinkCompression of type Compress
560   * @param hasHeader       of type boolean
561   * @param delimiter       of type String
562   * @param types           of type Class[]
563   * @param safe            of type boolean
564   */
565  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe"})
566  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe )
567    {
568    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types, safe );
569    }
570
571  /**
572   * Constructor TextDelimited creates a new TextDelimited instance.
573   *
574   * @param fields          of type Fields
575   * @param sinkCompression of type Compress
576   * @param hasHeader       of type boolean
577   * @param delimiter       of type String
578   * @param types           of type Class[]
579   * @param safe            of type boolean
580   * @param charsetName     of type String
581   */
582  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe", "charsetName"})
583  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe, String charsetName )
584    {
585    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, null, types, safe, charsetName );
586    }
587
588  /**
589   * Constructor TextDelimited creates a new TextDelimited instance.
590   *
591   * @param fields          of type Fields
592   * @param sinkCompression of type Compress
593   * @param skipHeader      of type boolean
594   * @param writeHeader     of type boolean
595   * @param delimiter       of type String
596   * @param types           of type Class[]
597   * @param safe            of type boolean
598   */
599  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types", "safe"})
600  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types, boolean safe )
601    {
602    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types, safe );
603    }
604
605  /**
606   * Constructor TextDelimited creates a new TextDelimited instance.
607   *
608   * @param fields    of type Fields
609   * @param delimiter of type String
610   * @param quote     of type String
611   */
612  @ConstructorProperties({"fields", "delimiter", "quote"})
613  public TextDelimited( Fields fields, String delimiter, String quote )
614    {
615    this( fields, null, delimiter, quote );
616    }
617
618  /**
619   * Constructor TextDelimited creates a new TextDelimited instance.
620   *
621   * @param fields    of type Fields
622   * @param hasHeader of type boolean
623   * @param delimiter of type String
624   * @param quote     of type String
625   */
626  @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"})
627  public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote )
628    {
629    this( fields, null, hasHeader, hasHeader, delimiter, quote );
630    }
631
632  /**
633   * Constructor TextDelimited creates a new TextDelimited instance.
634   *
635   * @param fields      of type Fields
636   * @param skipHeader  of type boolean
637   * @param writeHeader of type boolean
638   * @param delimiter   of type String
639   * @param quote       of type String
640   */
641  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote"})
642  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote )
643    {
644    this( fields, null, skipHeader, writeHeader, delimiter, quote );
645    }
646
647  /**
648   * Constructor TextDelimited creates a new TextDelimited instance.
649   *
650   * @param fields          of type Fields
651   * @param sinkCompression of type Compress
652   * @param delimiter       of type String
653   * @param quote           of type String
654   */
655  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote"})
656  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote )
657    {
658    this( fields, sinkCompression, false, false, delimiter, true, quote, null, true );
659    }
660
661  /**
662   * Constructor TextDelimited creates a new TextDelimited instance.
663   *
664   * @param fields          of type Fields
665   * @param sinkCompression of type Compress
666   * @param hasHeader       of type boolean
667   * @param delimiter       of type String
668   * @param quote           of type String
669   */
670  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote"})
671  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote )
672    {
673    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true );
674    }
675
676  /**
677   * Constructor TextDelimited creates a new TextDelimited instance.
678   *
679   * @param fields          of type Fields
680   * @param sinkCompression of type Compress
681   * @param hasHeader       of type boolean
682   * @param delimiter       of type String
683   * @param quote           of type String
684   * @param charsetName     of type String
685   */
686  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "charsetName"})
687  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, String charsetName )
688    {
689    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true, charsetName );
690    }
691
692  /**
693   * Constructor TextDelimited creates a new TextDelimited instance.
694   *
695   * @param fields          of type Fields
696   * @param sinkCompression of type Compress
697   * @param skipHeader      of type boolean
698   * @param writeHeader     of type boolean
699   * @param delimiter       of type String
700   * @param quote           of type String
701   */
702  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote"})
703  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote )
704    {
705    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, null, true );
706    }
707
708  /**
709   * Constructor TextDelimited creates a new TextDelimited instance.
710   *
711   * @param fields          of type Fields
712   * @param sinkCompression of type Compress
713   * @param delimiter       of type String
714   * @param quote           of type String
715   * @param types           of type Class[]
716   */
717  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types"})
718  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types )
719    {
720    this( fields, sinkCompression, false, false, delimiter, true, quote, types, true );
721    }
722
  /**
   * Constructor TextDelimited creates a new TextDelimited instance where a single flag controls the header in
   * both directions.
   * <p>
   * The {@code hasHeader} value is used for both {@code skipHeader} and {@code writeHeader}; {@code strict} and
   * {@code safe} are set to {@code true}.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param hasHeader       of type boolean, used as both the skipHeader and the writeHeader value
   * @param delimiter       of type String, the string separating the values in a line
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types )
    {
    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, true );
    }
738
  /**
   * Constructor TextDelimited creates a new TextDelimited instance with independent header flags for sourcing
   * and sinking.
   * <p>
   * Delegates with {@code strict} and {@code safe} set to {@code true}.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimiter       of type String, the string separating the values in a line
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types )
    {
    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, true );
    }
755
  /**
   * Constructor TextDelimited creates a new TextDelimited instance that neither skips nor writes a header line
   * and lets the caller choose the {@code safe} flag.
   * <p>
   * Delegates with {@code skipHeader} and {@code writeHeader} set to {@code false} and {@code strict} set to
   * {@code true}.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param delimiter       of type String, the string separating the values in a line
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   * @param safe            of type boolean, handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types", "safe"})
  public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types, boolean safe )
    {
    this( fields, sinkCompression, false, false, delimiter, true, quote, types, safe );
    }
771
  /**
   * Constructor TextDelimited creates a new TextDelimited instance where a single flag controls the header in
   * both directions and the caller chooses the {@code safe} flag.
   * <p>
   * The {@code hasHeader} value is used for both {@code skipHeader} and {@code writeHeader}; {@code strict} is
   * set to {@code true}.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param hasHeader       of type boolean, used as both the skipHeader and the writeHeader value
   * @param delimiter       of type String, the string separating the values in a line
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   * @param safe            of type boolean, handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types", "safe"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe )
    {
    this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, safe );
    }
788
  /**
   * Constructor TextDelimited creates a new TextDelimited instance with independent header flags and a caller
   * supplied {@code safe} flag.
   * <p>
   * Delegates with {@code strict} set to {@code true}.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimiter       of type String, the string separating the values in a line
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   * @param safe            of type boolean, handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types",
                          "safe"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe )
    {
    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, safe );
    }
807
  /**
   * Constructor TextDelimited creates a new TextDelimited instance using the {@code DEFAULT_CHARSET} charset name.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimiter       of type String, the string separating the values in a line
   * @param strict          of type boolean, handed to the {@link DelimitedParser}
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   * @param safe            of type boolean, handed to the {@link DelimitedParser}
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote",
                          "types", "safe"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe )
    {
    this( fields, sinkCompression, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET );
    }
826
  /**
   * Constructor TextDelimited creates a new TextDelimited instance, building a new {@link DelimitedParser} from
   * the given delimiter, quote, types, strict and safe values.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimiter       of type String, the string separating the values in a line
   * @param strict          of type boolean, handed to the {@link DelimitedParser}
   * @param quote           of type String, the string used for quoting values
   * @param types           of type Class[], handed to the {@link DelimitedParser}
   * @param safe            of type boolean, handed to the {@link DelimitedParser}
   * @param charsetName     of type String, the name of the charset used to encode and decode text
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote",
                          "types", "safe", "charsetName"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName )
    {
    this( fields, sinkCompression, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) );
    }
846
  /**
   * Constructor TextDelimited creates a new TextDelimited instance around an existing {@link DelimitedParser}.
   * <p>
   * Delegates with {@code null} for both the sink compression and the charset name.
   * NOTE(review): presumably the parent then applies its defaults for both; confirm against the parent scheme.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimitedParser of type DelimitedParser, the parser used to split and join lines
   */
  @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"})
  public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
    {
    this( fields, null, skipHeader, writeHeader, null, delimitedParser );
    }
859
  /**
   * Constructor TextDelimited creates a new TextDelimited instance around an existing {@link DelimitedParser},
   * where a single flag controls the header in both directions.
   * <p>
   * The {@code hasHeader} value is used for both {@code skipHeader} and {@code writeHeader}; {@code null} is
   * passed for both the sink compression and the charset name.
   * NOTE(review): presumably the parent then applies its defaults for both; confirm against the parent scheme.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param hasHeader       of type boolean, used as both the skipHeader and the writeHeader value
   * @param delimitedParser of type DelimitedParser, the parser used to split and join lines
   */
  @ConstructorProperties({"fields", "hasHeader", "delimitedParser"})
  public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser )
    {
    this( fields, null, hasHeader, hasHeader, null, delimitedParser );
    }
872
  /**
   * Constructor TextDelimited creates a new TextDelimited instance around an existing {@link DelimitedParser}
   * with an explicit sink compression.
   * <p>
   * Delegates with {@code null} for the charset name.
   * NOTE(review): presumably a default charset is then applied; confirm against setCharsetName.
   *
   * @param fields          of type Fields, the fields this scheme sources and sinks
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param delimitedParser of type DelimitedParser, the parser used to split and join lines
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimitedParser"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser )
    {
    this( fields, sinkCompression, skipHeader, writeHeader, null, delimitedParser );
    }
885
  /**
   * Constructor TextDelimited creates a new TextDelimited instance from fully resolved settings.
   * <p>
   * The parser is stored before the fields are applied, because the field setters of this class reset the parser
   * with the resulting source and sink fields when a parser is present.
   *
   * @param fields          of type Fields, applied as both the sink and the source fields
   * @param sinkCompression of type Compress, handed to the parent scheme constructor
   * @param skipHeader      of type boolean, when true the record found at position zero is skipped on sourcing
   * @param writeHeader     of type boolean, when true a header line is emitted when the sink is prepared
   * @param charsetName     of type String, handed to setCharsetName
   * @param delimitedParser of type DelimitedParser, the parser used to split and join lines
   */
  @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "charsetName", "delimitedParser"})
  public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser )
    {
    super( sinkCompression );

    // must be assigned before the setters below, which reset the parser only when it is non-null
    this.delimitedParser = delimitedParser;

    // normalizes ALL and UNKNOWN
    setSinkFields( fields );
    setSourceFields( fields );

    this.skipHeader = skipHeader;
    this.writeHeader = writeHeader;

    // throws an exception if not found
    setCharsetName( charsetName );
    }
913
  /**
   * Method getDelimiter returns the delimiter used to parse fields from the current line of text, as reported by
   * the underlying {@link DelimitedParser}.
   *
   * @return a String
   */
  @Property(name = "delimiter", visibility = Visibility.PUBLIC)
  @PropertyDescription("The delimiter used to separate fields.")
  public String getDelimiter()
    {
    return delimitedParser.getDelimiter();
    }
925
  /**
   * Method getQuote returns the quote string, if any, used to encapsulate each field in a line of delimited text,
   * as reported by the underlying {@link DelimitedParser}.
   *
   * @return a String
   */
  @Property(name = "quote", visibility = Visibility.PUBLIC)
  @PropertyDescription("The string used for quoting.")
  public String getQuote()
    {
    return delimitedParser.getQuote();
    }
937
938  @Override
939  public boolean isSymmetrical()
940    {
941    return super.isSymmetrical() && skipHeader == writeHeader;
942    }
943
944  @Override
945  public void setSinkFields( Fields sinkFields )
946    {
947    super.setSourceFields( sinkFields );
948    super.setSinkFields( sinkFields );
949
950    if( delimitedParser != null )
951      delimitedParser.reset( getSourceFields(), getSinkFields() );
952    }
953
954  @Override
955  public void setSourceFields( Fields sourceFields )
956    {
957    super.setSourceFields( sourceFields );
958    super.setSinkFields( sourceFields );
959
960    if( delimitedParser != null )
961      delimitedParser.reset( getSourceFields(), getSinkFields() );
962    }
963
964  @Override
965  public Fields retrieveSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap )
966    {
967    if( !skipHeader || !getSourceFields().isUnknown() )
968      return getSourceFields();
969
970    // no need to open them all
971    if( tap instanceof CompositeTap )
972      tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next();
973
974    // should revert to file:// (Lfs) if tap is Lfs
975    if( tap instanceof TapWith )
976      tap = ( (TapWith) tap ).withScheme( new TextLine( new Fields( "line" ), charsetName ) ).asTap();
977    else
978      tap = new Hfs( new TextLine( new Fields( "line" ), charsetName ), tap.getFullIdentifier( flowProcess ) );
979
980    setSourceFields( delimitedParser.parseFirstLine( flowProcess, tap ) );
981
982    return getSourceFields();
983    }
984
  /**
   * Method presentSourceFields forwards the presented fields to {@code presentSourceFieldsInternal}; the given
   * flowProcess and tap are not used by this implementation.
   *
   * @param flowProcess of type FlowProcess
   * @param tap         of type Tap
   * @param fields      of type Fields
   */
  @Override
  public void presentSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
    {
    presentSourceFieldsInternal( fields );
    }
990
  /**
   * Method presentSinkFields forwards the presented fields to {@code presentSinkFieldsInternal}; the given
   * flowProcess and tap are not used by this implementation.
   *
   * @param flowProcess of type FlowProcess
   * @param tap         of type Tap
   * @param fields      of type Fields
   */
  @Override
  public void presentSinkFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
    {
    presentSinkFieldsInternal( fields );
    }
996
997  @Override
998  public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
999    {
1000    super.sourcePrepare( flowProcess, sourceCall );
1001
1002    sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() );
1003    }
1004
1005  @Override
1006  public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException
1007    {
1008    Object[] context = sourceCall.getContext();
1009
1010    if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) )
1011      return false;
1012
1013    if( skipHeader && ( (LongWritable) context[ 0 ] ).get() == 0 )
1014      {
1015      if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) )
1016        return false;
1017      }
1018
1019    // delegate coercion to delimitedParser for robustness
1020    Object[] split = delimitedParser.parseLine( makeEncodedString( context ) );
1021    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
1022
1023    TupleViews.reset( tuple, split );
1024
1025    return true;
1026    }
1027
1028  @Override
1029  public void sinkPrepare( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1030    {
1031    sinkCall.setContext( new Object[ 3 ] );
1032
1033    sinkCall.getContext()[ 0 ] = new Text();
1034    sinkCall.getContext()[ 1 ] = new StringBuilder( 4 * 1024 );
1035    sinkCall.getContext()[ 2 ] = Charset.forName( charsetName );
1036
1037    if( writeHeader )
1038      writeHeader( sinkCall );
1039    }
1040
1041  protected void writeHeader( SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1042    {
1043    Fields fields = sinkCall.getOutgoingEntry().getFields();
1044
1045    Text text = (Text) sinkCall.getContext()[ 0 ];
1046    StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ];
1047    Charset charset = (Charset) sinkCall.getContext()[ 2 ];
1048
1049    line = (StringBuilder) delimitedParser.joinFirstLine( fields, line );
1050
1051    text.set( line.toString().getBytes( charset ) );
1052
1053    sinkCall.getOutput().collect( null, text );
1054
1055    line.setLength( 0 );
1056    }
1057
1058  @Override
1059  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
1060    {
1061    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
1062
1063    Text text = (Text) sinkCall.getContext()[ 0 ];
1064    StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ];
1065    Charset charset = (Charset) sinkCall.getContext()[ 2 ];
1066
1067    Iterable<String> strings = tupleEntry.asIterableOf( String.class );
1068
1069    line = (StringBuilder) delimitedParser.joinLine( strings, line );
1070
1071    text.set( line.toString().getBytes( charset ) );
1072
1073    sinkCall.getOutput().collect( null, text );
1074
1075    line.setLength( 0 );
1076    }
1077
1078  @Override
1079  public String getExtension()
1080    {
1081    switch( getDelimiter().trim() )
1082      {
1083      case "\t":
1084        return "tsv";
1085
1086      case ",":
1087        return "csv";
1088      }
1089
1090    return "txt";
1091    }
1092  }
1093