001/* 002 * Copyright (c) 2016-2018 Chris K Wensel <chris@wensel.net>. All Rights Reserved. 003 * Copyright (c) 2007-2017 Xplenty, Inc. All Rights Reserved. 004 * 005 * Project and contact information: http://www.cascading.org/ 006 * 007 * This file is part of the Cascading project. 008 * 009 * Licensed under the Apache License, Version 2.0 (the "License"); 010 * you may not use this file except in compliance with the License. 011 * You may obtain a copy of the License at 012 * 013 * http://www.apache.org/licenses/LICENSE-2.0 014 * 015 * Unless required by applicable law or agreed to in writing, software 016 * distributed under the License is distributed on an "AS IS" BASIS, 017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 018 * See the License for the specific language governing permissions and 019 * limitations under the License. 020 */ 021 022package cascading.scheme.hadoop; 023 024import java.beans.ConstructorProperties; 025import java.io.IOException; 026import java.nio.charset.Charset; 027 028import cascading.flow.FlowProcess; 029import cascading.management.annotation.Property; 030import cascading.management.annotation.PropertyDescription; 031import cascading.management.annotation.Visibility; 032import cascading.scheme.SinkCall; 033import cascading.scheme.SourceCall; 034import cascading.scheme.util.DelimitedParser; 035import cascading.tap.CompositeTap; 036import cascading.tap.Tap; 037import cascading.tap.TapException; 038import cascading.tap.hadoop.Hfs; 039import cascading.tap.type.TapWith; 040import cascading.tuple.Fields; 041import cascading.tuple.Tuple; 042import cascading.tuple.TupleEntry; 043import cascading.tuple.util.TupleViews; 044import org.apache.hadoop.conf.Configuration; 045import org.apache.hadoop.io.LongWritable; 046import org.apache.hadoop.io.Text; 047import org.apache.hadoop.mapred.OutputCollector; 048import org.apache.hadoop.mapred.RecordReader; 049 050/** 051 * Class TextDelimited is a sub-class of {@link 
TextLine}. It provides direct support for delimited text files, like
 * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values.
 * <p>
 * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line
 * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will
 * be skipped.
 * <p>
 * It is assumed if sink/source {@code fields} is set to either {@link Fields#ALL} or {@link Fields#UNKNOWN} and
 * {@code skipHeader} or {@code hasHeader} is {@code true}, the field names will be retrieved from the header of the
 * file and used during planning. The header will be parsed with the same rules as the body of the file.
 * <p>
 * By default headers are not skipped.
 * <p>
 * TextDelimited may also be used to write a "header" in a file. The field names for the header are taken directly
 * from the declared fields. Or if the declared fields are {@link Fields#ALL} or {@link Fields#UNKNOWN}, the
 * resolved field names will be used, if any.
 * <p>
 * By default headers are not written.
 * <p>
 * If {@code hasHeader} is set to {@code true} on a constructor, both {@code skipHeader} and {@code writeHeader} will
 * be set to {@code true}.
 * <p>
 * By default this {@link cascading.scheme.Scheme} is both {@code strict} and {@code safe}.
 * <p>
 * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a
 * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values
 * for the missing fields.
 * <p>
 * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value.
 * If safe is {@code false}, a {@link TapException} will be thrown.
 * <p>
 * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is
 * COMMA delimited but may have COMMAs in a value, the whole value should be surrounded by the quote string, typically
 * double quotes ({@literal "}).
 * <p>
 * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type.
 * <p>
 * This Scheme may source/sink {@link Fields#ALL}. When given on the constructor, the new instance will automatically
 * default to strict == false as the number of fields parsed is arbitrary or unknown. A type array may not be given
 * either, so all values will be returned as Strings.
 * <p>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 * <p>
 * To override field and line parsing behaviors, sub-class {@link DelimitedParser} or provide a
 * {@link cascading.scheme.util.FieldTypeResolver} implementation.
 * <p>
 * Note that there should be no expectation that TextDelimited, or specifically {@link DelimitedParser}, can handle
 * all delimited and quoted combinations reliably. Attempting to do so would impair its performance and maintainability.
 * <p>
 * Further, it can be safely said any corrupted files will not be supported for obvious reasons. Corrupted files may
 * result in exceptions or could cause edge cases in the underlying java regular expression engine.
 * <p>
 * A large part of Cascading was designed to help users cleanse data. Thus the recommendation is to create Flows that
 * are responsible for cleansing large data-sets when faced with the problem.
 * <p>
 * DelimitedParser may be sub-classed and extended if necessary.
108 * 109 * @see TextLine 110 */ 111public class TextDelimited extends TextLine 112 { 113 public static final String DEFAULT_CHARSET = "UTF-8"; 114 115 /** Field delimitedParser */ 116 protected final DelimitedParser delimitedParser; 117 /** Field skipHeader */ 118 private boolean skipHeader; 119 private final boolean writeHeader; 120 121 /** 122 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 123 * {@link Fields#ALL} and using TAB as the default delimiter. 124 * <p> 125 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 126 * with a {@link cascading.pipe.Checkpoint} Tap. 127 */ 128 public TextDelimited() 129 { 130 this( Fields.ALL, null, "\t", null, null ); 131 } 132 133 /** 134 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 135 * {@link Fields#ALL} and using TAB as the default delimiter. 136 * <p> 137 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 138 * with a {@link cascading.pipe.Checkpoint} Tap. 139 * 140 * @param hasHeader of type boolean 141 * @param delimiter of type String 142 */ 143 @ConstructorProperties({"hasHeader", "delimiter"}) 144 public TextDelimited( boolean hasHeader, String delimiter ) 145 { 146 this( Fields.ALL, null, hasHeader, delimiter, null, (Class[]) null ); 147 } 148 149 /** 150 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 151 * {@link Fields#ALL} and using TAB as the default delimiter. 152 * <p> 153 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 154 * with a {@link cascading.pipe.Checkpoint} Tap. 
155 * 156 * @param hasHeader of type boolean 157 * @param delimiter of type String 158 * @param quote of type String 159 */ 160 @ConstructorProperties({"hasHeader", "delimiter", "quote"}) 161 public TextDelimited( boolean hasHeader, String delimiter, String quote ) 162 { 163 this( Fields.ALL, null, hasHeader, delimiter, quote, (Class[]) null ); 164 } 165 166 /** 167 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 168 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 169 * <p> 170 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 171 * with a {@link cascading.pipe.Checkpoint} Tap. 172 * 173 * @param hasHeader of type boolean 174 * @param delimitedParser of type DelimitedParser 175 */ 176 @ConstructorProperties({"hasHeader", "delimitedParser"}) 177 public TextDelimited( boolean hasHeader, DelimitedParser delimitedParser ) 178 { 179 this( Fields.ALL, null, hasHeader, hasHeader, delimitedParser ); 180 } 181 182 /** 183 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 184 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 185 * <p> 186 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 187 * with a {@link cascading.pipe.Checkpoint} Tap. 188 * <p> 189 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 190 * 191 * @param delimitedParser of type DelimitedParser 192 */ 193 @ConstructorProperties({"delimitedParser"}) 194 public TextDelimited( DelimitedParser delimitedParser ) 195 { 196 this( Fields.ALL, null, true, true, delimitedParser ); 197 } 198 199 /** 200 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 201 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 
202 * <p> 203 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 204 * with a {@link cascading.pipe.Checkpoint} Tap. 205 * 206 * @param sinkCompression of type Compress 207 * @param hasHeader of type boolean 208 * @param delimitedParser of type DelimitedParser 209 */ 210 @ConstructorProperties({"sinkCompression", "hasHeader", "delimitedParser"}) 211 public TextDelimited( Compress sinkCompression, boolean hasHeader, DelimitedParser delimitedParser ) 212 { 213 this( Fields.ALL, sinkCompression, hasHeader, hasHeader, delimitedParser ); 214 } 215 216 /** 217 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 218 * {@link Fields#ALL} and using the given delimitedParser instance for parsing. 219 * <p> 220 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 221 * with a {@link cascading.pipe.Checkpoint} Tap. 222 * <p> 223 * This constructor will set {@code skipHeader} and {@code writeHeader} values to true. 224 * 225 * @param delimitedParser of type DelimitedParser 226 */ 227 @ConstructorProperties({"sinkCompression", "delimitedParser"}) 228 public TextDelimited( Compress sinkCompression, DelimitedParser delimitedParser ) 229 { 230 this( Fields.ALL, sinkCompression, true, true, delimitedParser ); 231 } 232 233 /** 234 * Constructor TextDelimited creates a new TextDelimited instance sourcing {@link Fields#UNKNOWN}, sinking 235 * {@link Fields#ALL} and using TAB as the default delimiter. 236 * <p> 237 * Use this constructor if the source and sink fields will be resolved during planning, for example, when using 238 * with a {@link cascading.pipe.Checkpoint} Tap. 
239 * 240 * @param sinkCompression of type Compress 241 * @param hasHeader of type boolean 242 * @param delimiter of type String 243 * @param quote of type String 244 */ 245 @ConstructorProperties({"sinkCompression", "hasHeader", "delimiter", "quote"}) 246 public TextDelimited( Compress sinkCompression, boolean hasHeader, String delimiter, String quote ) 247 { 248 this( Fields.ALL, sinkCompression, hasHeader, delimiter, quote, (Class[]) null ); 249 } 250 251 /** 252 * Constructor TextDelimited creates a new TextDelimited instance with TAB as the default delimiter. 253 * 254 * @param fields of type Fields 255 */ 256 @ConstructorProperties({"fields"}) 257 public TextDelimited( Fields fields ) 258 { 259 this( fields, null, "\t", null, null ); 260 } 261 262 /** 263 * Constructor TextDelimited creates a new TextDelimited instance. 264 * 265 * @param fields of type Fields 266 * @param delimiter of type String 267 */ 268 @ConstructorProperties({"fields", "delimiter"}) 269 public TextDelimited( Fields fields, String delimiter ) 270 { 271 this( fields, null, delimiter, null, null ); 272 } 273 274 /** 275 * Constructor TextDelimited creates a new TextDelimited instance. 276 * 277 * @param fields of type Fields 278 * @param hasHeader of type boolean 279 * @param delimiter of type String 280 */ 281 @ConstructorProperties({"fields", "hasHeader", "delimiter"}) 282 public TextDelimited( Fields fields, boolean hasHeader, String delimiter ) 283 { 284 this( fields, null, hasHeader, hasHeader, delimiter, null, null ); 285 } 286 287 /** 288 * Constructor TextDelimited creates a new TextDelimited instance. 
289 * 290 * @param fields of type Fields 291 * @param skipHeader of type boolean 292 * @param writeHeader of type boolean 293 * @param delimiter of type String 294 */ 295 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter"}) 296 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter ) 297 { 298 this( fields, null, skipHeader, writeHeader, delimiter, null, null ); 299 } 300 301 /** 302 * Constructor TextDelimited creates a new TextDelimited instance. 303 * 304 * @param fields of type Fields 305 * @param delimiter of type String 306 * @param types of type Class[] 307 */ 308 @ConstructorProperties({"fields", "delimiter", "types"}) 309 public TextDelimited( Fields fields, String delimiter, Class[] types ) 310 { 311 this( fields, null, delimiter, null, types ); 312 } 313 314 /** 315 * Constructor TextDelimited creates a new TextDelimited instance. 316 * 317 * @param fields of type Fields 318 * @param hasHeader of type boolean 319 * @param delimiter of type String 320 * @param types of type Class[] 321 */ 322 @ConstructorProperties({"fields", "hasHeader", "delimiter", "types"}) 323 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, Class[] types ) 324 { 325 this( fields, null, hasHeader, hasHeader, delimiter, null, types ); 326 } 327 328 /** 329 * Constructor TextDelimited creates a new TextDelimited instance. 330 * 331 * @param fields of type Fields 332 * @param skipHeader of type boolean 333 * @param writeHeader of type boolean 334 * @param delimiter of type String 335 * @param types of type Class[] 336 */ 337 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "types"}) 338 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 339 { 340 this( fields, null, skipHeader, writeHeader, delimiter, null, types ); 341 } 342 343 /** 344 * Constructor TextDelimited creates a new TextDelimited instance. 
345 * 346 * @param fields of type Fields 347 * @param delimiter of type String 348 * @param quote of type String 349 * @param types of type Class[] 350 */ 351 @ConstructorProperties({"fields", "delimiter", "quote", "types"}) 352 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types ) 353 { 354 this( fields, null, delimiter, quote, types ); 355 } 356 357 /** 358 * Constructor TextDelimited creates a new TextDelimited instance. 359 * 360 * @param fields of type Fields 361 * @param hasHeader of type boolean 362 * @param delimiter of type String 363 * @param quote of type String 364 * @param types of type Class[] 365 */ 366 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types"}) 367 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types ) 368 { 369 this( fields, null, hasHeader, hasHeader, delimiter, quote, types ); 370 } 371 372 /** 373 * Constructor TextDelimited creates a new TextDelimited instance. 374 * 375 * @param fields of type Fields 376 * @param skipHeader of type boolean 377 * @param writeHeader of type boolean 378 * @param delimiter of type String 379 * @param quote of type String 380 * @param types of type Class[] 381 */ 382 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 383 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 384 { 385 this( fields, null, skipHeader, writeHeader, delimiter, quote, types ); 386 } 387 388 /** 389 * Constructor TextDelimited creates a new TextDelimited instance. 
390 * 391 * @param fields of type Fields 392 * @param delimiter of type String 393 * @param quote of type String 394 * @param types of type Class[] 395 * @param safe of type boolean 396 */ 397 @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"}) 398 public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe ) 399 { 400 this( fields, null, delimiter, quote, types, safe ); 401 } 402 403 /** 404 * Constructor TextDelimited creates a new TextDelimited instance. 405 * 406 * @param fields of type Fields 407 * @param hasHeader of type boolean 408 * @param delimiter of type String 409 * @param quote of type String 410 * @param types of type Class[] 411 * @param safe of type boolean 412 */ 413 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe"}) 414 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 415 { 416 this( fields, null, hasHeader, hasHeader, delimiter, quote, types, safe ); 417 } 418 419 /** 420 * Constructor TextDelimited creates a new TextDelimited instance. 421 * 422 * @param fields of type Fields 423 * @param hasHeader of type boolean 424 * @param delimiter of type String 425 * @param quote of type String 426 * @param types of type Class[] 427 * @param safe of type boolean 428 * @param charsetName of type String 429 */ 430 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote", "types", "safe", "charsetName"}) 431 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe, String charsetName ) 432 { 433 this( fields, null, hasHeader, hasHeader, delimiter, true, quote, types, safe, charsetName ); 434 } 435 436 /** 437 * Constructor TextDelimited creates a new TextDelimited instance. 
438 * 439 * @param fields of type Fields 440 * @param skipHeader of type boolean 441 * @param writeHeader of type boolean 442 * @param delimiter of type String 443 * @param quote of type String 444 * @param types of type Class[] 445 * @param safe of type boolean 446 */ 447 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote", "types", "safe"}) 448 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 449 { 450 this( fields, null, skipHeader, writeHeader, delimiter, quote, types, safe ); 451 } 452 453 /** 454 * Constructor TextDelimited creates a new TextDelimited instance. 455 * 456 * @param fields of type Fields 457 * @param sinkCompression of type Compress 458 * @param delimiter of type String 459 */ 460 @ConstructorProperties({"fields", "sinkCompression", "delimiter"}) 461 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter ) 462 { 463 this( fields, sinkCompression, delimiter, null, null ); 464 } 465 466 /** 467 * Constructor TextDelimited creates a new TextDelimited instance. 468 * 469 * @param fields of type Fields 470 * @param sinkCompression of type Compress 471 * @param hasHeader of type boolean 472 * @param delimiter of type String 473 */ 474 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter"}) 475 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter ) 476 { 477 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, null ); 478 } 479 480 /** 481 * Constructor TextDelimited creates a new TextDelimited instance. 
482 * 483 * @param fields of type Fields 484 * @param sinkCompression of type Compress 485 * @param skipHeader of type boolean 486 * @param writeHeader of type boolean 487 * @param delimiter of type String 488 */ 489 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter"}) 490 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter ) 491 { 492 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, null ); 493 } 494 495 /** 496 * Constructor TextDelimited creates a new TextDelimited instance. 497 * 498 * @param fields of type Fields 499 * @param sinkCompression of type Compress 500 * @param delimiter of type String 501 * @param types of type Class[] 502 */ 503 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types"}) 504 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types ) 505 { 506 this( fields, sinkCompression, delimiter, null, types ); 507 } 508 509 /** 510 * Constructor TextDelimited creates a new TextDelimited instance. 511 * 512 * @param fields of type Fields 513 * @param sinkCompression of type Compress 514 * @param hasHeader of type boolean 515 * @param delimiter of type String 516 * @param types of type Class[] 517 */ 518 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types"}) 519 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types ) 520 { 521 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types ); 522 } 523 524 /** 525 * Constructor TextDelimited creates a new TextDelimited instance. 
526 * 527 * @param fields of type Fields 528 * @param sinkCompression of type Compress 529 * @param skipHeader of type boolean 530 * @param writeHeader of type boolean 531 * @param delimiter of type String 532 * @param types of type Class[] 533 */ 534 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types"}) 535 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types ) 536 { 537 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types ); 538 } 539 540 /** 541 * Constructor TextDelimited creates a new TextDelimited instance. 542 * 543 * @param fields of type Fields 544 * @param sinkCompression of type Compress 545 * @param delimiter of type String 546 * @param types of type Class[] 547 * @param safe of type boolean 548 */ 549 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types", "safe"}) 550 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types, boolean safe ) 551 { 552 this( fields, sinkCompression, delimiter, null, types, safe ); 553 } 554 555 /** 556 * Constructor TextDelimited creates a new TextDelimited instance. 557 * 558 * @param fields of type Fields 559 * @param sinkCompression of type Compress 560 * @param hasHeader of type boolean 561 * @param delimiter of type String 562 * @param types of type Class[] 563 * @param safe of type boolean 564 */ 565 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe"}) 566 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe ) 567 { 568 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, null, types, safe ); 569 } 570 571 /** 572 * Constructor TextDelimited creates a new TextDelimited instance. 
573 * 574 * @param fields of type Fields 575 * @param sinkCompression of type Compress 576 * @param hasHeader of type boolean 577 * @param delimiter of type String 578 * @param types of type Class[] 579 * @param safe of type boolean 580 * @param charsetName of type String 581 */ 582 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "types", "safe", "charsetName"}) 583 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, Class[] types, boolean safe, String charsetName ) 584 { 585 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, null, types, safe, charsetName ); 586 } 587 588 /** 589 * Constructor TextDelimited creates a new TextDelimited instance. 590 * 591 * @param fields of type Fields 592 * @param sinkCompression of type Compress 593 * @param skipHeader of type boolean 594 * @param writeHeader of type boolean 595 * @param delimiter of type String 596 * @param types of type Class[] 597 * @param safe of type boolean 598 */ 599 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "types", "safe"}) 600 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, Class[] types, boolean safe ) 601 { 602 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, null, types, safe ); 603 } 604 605 /** 606 * Constructor TextDelimited creates a new TextDelimited instance. 607 * 608 * @param fields of type Fields 609 * @param delimiter of type String 610 * @param quote of type String 611 */ 612 @ConstructorProperties({"fields", "delimiter", "quote"}) 613 public TextDelimited( Fields fields, String delimiter, String quote ) 614 { 615 this( fields, null, delimiter, quote ); 616 } 617 618 /** 619 * Constructor TextDelimited creates a new TextDelimited instance. 
620 * 621 * @param fields of type Fields 622 * @param hasHeader of type boolean 623 * @param delimiter of type String 624 * @param quote of type String 625 */ 626 @ConstructorProperties({"fields", "hasHeader", "delimiter", "quote"}) 627 public TextDelimited( Fields fields, boolean hasHeader, String delimiter, String quote ) 628 { 629 this( fields, null, hasHeader, hasHeader, delimiter, quote ); 630 } 631 632 /** 633 * Constructor TextDelimited creates a new TextDelimited instance. 634 * 635 * @param fields of type Fields 636 * @param skipHeader of type boolean 637 * @param writeHeader of type boolean 638 * @param delimiter of type String 639 * @param quote of type String 640 */ 641 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimiter", "quote"}) 642 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, String delimiter, String quote ) 643 { 644 this( fields, null, skipHeader, writeHeader, delimiter, quote ); 645 } 646 647 /** 648 * Constructor TextDelimited creates a new TextDelimited instance. 649 * 650 * @param fields of type Fields 651 * @param sinkCompression of type Compress 652 * @param delimiter of type String 653 * @param quote of type String 654 */ 655 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote"}) 656 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote ) 657 { 658 this( fields, sinkCompression, false, false, delimiter, true, quote, null, true ); 659 } 660 661 /** 662 * Constructor TextDelimited creates a new TextDelimited instance. 
663 * 664 * @param fields of type Fields 665 * @param sinkCompression of type Compress 666 * @param hasHeader of type boolean 667 * @param delimiter of type String 668 * @param quote of type String 669 */ 670 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote"}) 671 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote ) 672 { 673 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true ); 674 } 675 676 /** 677 * Constructor TextDelimited creates a new TextDelimited instance. 678 * 679 * @param fields of type Fields 680 * @param sinkCompression of type Compress 681 * @param hasHeader of type boolean 682 * @param delimiter of type String 683 * @param quote of type String 684 * @param charsetName of type String 685 */ 686 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "charsetName"}) 687 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, String charsetName ) 688 { 689 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, null, true, charsetName ); 690 } 691 692 /** 693 * Constructor TextDelimited creates a new TextDelimited instance. 694 * 695 * @param fields of type Fields 696 * @param sinkCompression of type Compress 697 * @param skipHeader of type boolean 698 * @param writeHeader of type boolean 699 * @param delimiter of type String 700 * @param quote of type String 701 */ 702 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote"}) 703 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote ) 704 { 705 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, null, true ); 706 } 707 708 /** 709 * Constructor TextDelimited creates a new TextDelimited instance. 
710 * 711 * @param fields of type Fields 712 * @param sinkCompression of type Compress 713 * @param delimiter of type String 714 * @param quote of type String 715 * @param types of type Class[] 716 */ 717 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types"}) 718 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types ) 719 { 720 this( fields, sinkCompression, false, false, delimiter, true, quote, types, true ); 721 } 722 723 /** 724 * Constructor TextDelimited creates a new TextDelimited instance. 725 * 726 * @param fields of type Fields 727 * @param sinkCompression of type Compress 728 * @param hasHeader of type boolean 729 * @param delimiter of type String 730 * @param quote of type String 731 * @param types of type Class[] 732 */ 733 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types"}) 734 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types ) 735 { 736 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, true ); 737 } 738 739 /** 740 * Constructor TextDelimited creates a new TextDelimited instance. 
741 * 742 * @param fields of type Fields 743 * @param sinkCompression of type Compress 744 * @param skipHeader of type boolean 745 * @param writeHeader of type boolean 746 * @param delimiter of type String 747 * @param quote of type String 748 * @param types of type Class[] 749 */ 750 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types"}) 751 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types ) 752 { 753 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, true ); 754 } 755 756 /** 757 * Constructor TextDelimited creates a new TextDelimited instance. 758 * 759 * @param fields of type Fields 760 * @param sinkCompression of type Compress 761 * @param delimiter of type String 762 * @param quote of type String 763 * @param types of type Class[] 764 * @param safe of type boolean 765 */ 766 @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types", "safe"}) 767 public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types, boolean safe ) 768 { 769 this( fields, sinkCompression, false, false, delimiter, true, quote, types, safe ); 770 } 771 772 /** 773 * Constructor TextDelimited creates a new TextDelimited instance. 
774 * 775 * @param fields of type Fields 776 * @param sinkCompression of type Compress 777 * @param hasHeader of type boolean 778 * @param delimiter of type String 779 * @param quote of type String 780 * @param types of type Class[] 781 * @param safe of type boolean 782 */ 783 @ConstructorProperties({"fields", "sinkCompression", "hasHeader", "delimiter", "quote", "types", "safe"}) 784 public TextDelimited( Fields fields, Compress sinkCompression, boolean hasHeader, String delimiter, String quote, Class[] types, boolean safe ) 785 { 786 this( fields, sinkCompression, hasHeader, hasHeader, delimiter, true, quote, types, safe ); 787 } 788 789 /** 790 * Constructor TextDelimited creates a new TextDelimited instance. 791 * 792 * @param fields of type Fields 793 * @param sinkCompression of type Compress 794 * @param skipHeader of type boolean 795 * @param writeHeader of type boolean 796 * @param delimiter of type String 797 * @param quote of type String 798 * @param types of type Class[] 799 * @param safe of type boolean 800 */ 801 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "quote", "types", 802 "safe"}) 803 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, String quote, Class[] types, boolean safe ) 804 { 805 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, true, quote, types, safe ); 806 } 807 808 /** 809 * Constructor TextDelimited creates a new TextDelimited instance. 
810 * 811 * @param fields of type Fields 812 * @param sinkCompression of type Compress 813 * @param skipHeader of type boolean 814 * @param delimiter of type String 815 * @param strict of type boolean 816 * @param quote of type String 817 * @param types of type Class[] 818 * @param safe of type boolean 819 */ 820 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote", 821 "types", "safe"}) 822 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe ) 823 { 824 this( fields, sinkCompression, skipHeader, writeHeader, delimiter, strict, quote, types, safe, DEFAULT_CHARSET ); 825 } 826 827 /** 828 * Constructor TextDelimited creates a new TextDelimited instance. 829 * 830 * @param fields of type Fields 831 * @param sinkCompression of type Compress 832 * @param skipHeader of type boolean 833 * @param delimiter of type String 834 * @param strict of type boolean 835 * @param quote of type String 836 * @param types of type Class[] 837 * @param safe of type boolean 838 * @param charsetName of type String 839 */ 840 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimiter", "strict", "quote", 841 "types", "safe", "charsetName"}) 842 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe, String charsetName ) 843 { 844 this( fields, sinkCompression, skipHeader, writeHeader, charsetName, new DelimitedParser( delimiter, quote, types, strict, safe ) ); 845 } 846 847 /** 848 * Constructor TextDelimited creates a new TextDelimited instance. 
849 * 850 * @param fields of type Fields 851 * @param writeHeader of type boolean 852 * @param delimitedParser of type DelimitedParser 853 */ 854 @ConstructorProperties({"fields", "skipHeader", "writeHeader", "delimitedParser"}) 855 public TextDelimited( Fields fields, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 856 { 857 this( fields, null, skipHeader, writeHeader, null, delimitedParser ); 858 } 859 860 /** 861 * Constructor TextDelimited creates a new TextDelimited instance. 862 * 863 * @param fields of type Fields 864 * @param hasHeader of type boolean 865 * @param delimitedParser of type DelimitedParser 866 */ 867 @ConstructorProperties({"fields", "hasHeader", "delimitedParser"}) 868 public TextDelimited( Fields fields, boolean hasHeader, DelimitedParser delimitedParser ) 869 { 870 this( fields, null, hasHeader, hasHeader, null, delimitedParser ); 871 } 872 873 /** 874 * Constructor TextDelimited creates a new TextDelimited instance. 875 * 876 * @param fields of type Fields 877 * @param writeHeader of type boolean 878 * @param delimitedParser of type DelimitedParser 879 */ 880 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "delimitedParser"}) 881 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, DelimitedParser delimitedParser ) 882 { 883 this( fields, sinkCompression, skipHeader, writeHeader, null, delimitedParser ); 884 } 885 886 /** 887 * Constructor TextDelimited creates a new TextDelimited instance. 
888 * 889 * @param fields of type Fields 890 * @param sinkCompression of type Compress 891 * @param skipHeader of type boolean 892 * @param writeHeader of type boolean 893 * @param charsetName of type String 894 * @param delimitedParser of type DelimitedParser 895 */ 896 @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "writeHeader", "charsetName", "delimitedParser"}) 897 public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, boolean writeHeader, String charsetName, DelimitedParser delimitedParser ) 898 { 899 super( sinkCompression ); 900 901 this.delimitedParser = delimitedParser; 902 903 // normalizes ALL and UNKNOWN 904 setSinkFields( fields ); 905 setSourceFields( fields ); 906 907 this.skipHeader = skipHeader; 908 this.writeHeader = writeHeader; 909 910 // throws an exception if not found 911 setCharsetName( charsetName ); 912 } 913 914 /** 915 * Method getDelimiter returns the delimiter used to parse fields from the current line of text. 916 * 917 * @return a String 918 */ 919 @Property(name = "delimiter", visibility = Visibility.PUBLIC) 920 @PropertyDescription("The delimiter used to separate fields.") 921 public String getDelimiter() 922 { 923 return delimitedParser.getDelimiter(); 924 } 925 926 /** 927 * Method getQuote returns the quote string, if any, used to encapsulate each field in a line to delimited text. 
928 * 929 * @return a String 930 */ 931 @Property(name = "quote", visibility = Visibility.PUBLIC) 932 @PropertyDescription("The string used for quoting.") 933 public String getQuote() 934 { 935 return delimitedParser.getQuote(); 936 } 937 938 @Override 939 public boolean isSymmetrical() 940 { 941 return super.isSymmetrical() && skipHeader == writeHeader; 942 } 943 944 @Override 945 public void setSinkFields( Fields sinkFields ) 946 { 947 super.setSourceFields( sinkFields ); 948 super.setSinkFields( sinkFields ); 949 950 if( delimitedParser != null ) 951 delimitedParser.reset( getSourceFields(), getSinkFields() ); 952 } 953 954 @Override 955 public void setSourceFields( Fields sourceFields ) 956 { 957 super.setSourceFields( sourceFields ); 958 super.setSinkFields( sourceFields ); 959 960 if( delimitedParser != null ) 961 delimitedParser.reset( getSourceFields(), getSinkFields() ); 962 } 963 964 @Override 965 public Fields retrieveSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap ) 966 { 967 if( !skipHeader || !getSourceFields().isUnknown() ) 968 return getSourceFields(); 969 970 // no need to open them all 971 if( tap instanceof CompositeTap ) 972 tap = (Tap) ( (CompositeTap) tap ).getChildTaps().next(); 973 974 // should revert to file:// (Lfs) if tap is Lfs 975 if( tap instanceof TapWith ) 976 tap = ( (TapWith) tap ).withScheme( new TextLine( new Fields( "line" ), charsetName ) ).asTap(); 977 else 978 tap = new Hfs( new TextLine( new Fields( "line" ), charsetName ), tap.getFullIdentifier( flowProcess ) ); 979 980 setSourceFields( delimitedParser.parseFirstLine( flowProcess, tap ) ); 981 982 return getSourceFields(); 983 } 984 985 @Override 986 public void presentSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields ) 987 { 988 presentSourceFieldsInternal( fields ); 989 } 990 991 @Override 992 public void presentSinkFields( FlowProcess<? 
extends Configuration> flowProcess, Tap tap, Fields fields ) 993 { 994 presentSinkFieldsInternal( fields ); 995 } 996 997 @Override 998 public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) 999 { 1000 super.sourcePrepare( flowProcess, sourceCall ); 1001 1002 sourceCall.getIncomingEntry().setTuple( TupleViews.createObjectArray() ); 1003 } 1004 1005 @Override 1006 public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException 1007 { 1008 Object[] context = sourceCall.getContext(); 1009 1010 if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) ) 1011 return false; 1012 1013 if( skipHeader && ( (LongWritable) context[ 0 ] ).get() == 0 ) 1014 { 1015 if( !sourceCall.getInput().next( context[ 0 ], context[ 1 ] ) ) 1016 return false; 1017 } 1018 1019 // delegate coercion to delimitedParser for robustness 1020 Object[] split = delimitedParser.parseLine( makeEncodedString( context ) ); 1021 Tuple tuple = sourceCall.getIncomingEntry().getTuple(); 1022 1023 TupleViews.reset( tuple, split ); 1024 1025 return true; 1026 } 1027 1028 @Override 1029 public void sinkPrepare( FlowProcess<? 
extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1030 { 1031 sinkCall.setContext( new Object[ 3 ] ); 1032 1033 sinkCall.getContext()[ 0 ] = new Text(); 1034 sinkCall.getContext()[ 1 ] = new StringBuilder( 4 * 1024 ); 1035 sinkCall.getContext()[ 2 ] = Charset.forName( charsetName ); 1036 1037 if( writeHeader ) 1038 writeHeader( sinkCall ); 1039 } 1040 1041 protected void writeHeader( SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1042 { 1043 Fields fields = sinkCall.getOutgoingEntry().getFields(); 1044 1045 Text text = (Text) sinkCall.getContext()[ 0 ]; 1046 StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ]; 1047 Charset charset = (Charset) sinkCall.getContext()[ 2 ]; 1048 1049 line = (StringBuilder) delimitedParser.joinFirstLine( fields, line ); 1050 1051 text.set( line.toString().getBytes( charset ) ); 1052 1053 sinkCall.getOutput().collect( null, text ); 1054 1055 line.setLength( 0 ); 1056 } 1057 1058 @Override 1059 public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException 1060 { 1061 TupleEntry tupleEntry = sinkCall.getOutgoingEntry(); 1062 1063 Text text = (Text) sinkCall.getContext()[ 0 ]; 1064 StringBuilder line = (StringBuilder) sinkCall.getContext()[ 1 ]; 1065 Charset charset = (Charset) sinkCall.getContext()[ 2 ]; 1066 1067 Iterable<String> strings = tupleEntry.asIterableOf( String.class ); 1068 1069 line = (StringBuilder) delimitedParser.joinLine( strings, line ); 1070 1071 text.set( line.toString().getBytes( charset ) ); 1072 1073 sinkCall.getOutput().collect( null, text ); 1074 1075 line.setLength( 0 ); 1076 } 1077 1078 @Override 1079 public String getExtension() 1080 { 1081 switch( getDelimiter().trim() ) 1082 { 1083 case "\t": 1084 return "tsv"; 1085 1086 case ",": 1087 return "csv"; 1088 } 1089 1090 return "txt"; 1091 } 1092 } 1093