001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading.local.tap.aws.s3; 022 023import java.io.DataOutputStream; 024import java.io.FilterInputStream; 025import java.io.IOException; 026import java.io.InputStream; 027import java.io.OutputStream; 028import java.net.URI; 029import java.net.URISyntaxException; 030import java.util.ArrayList; 031import java.util.Iterator; 032import java.util.List; 033import java.util.Properties; 034import java.util.function.Predicate; 035import java.util.regex.Pattern; 036 037import cascading.flow.FlowProcess; 038import cascading.property.PropertyUtil; 039import cascading.scheme.FileFormat; 040import cascading.scheme.Scheme; 041import cascading.tap.SinkMode; 042import cascading.tap.Tap; 043import cascading.tap.TapException; 044import cascading.tap.local.PartitionTap; 045import cascading.tap.type.FileType; 046import cascading.tap.type.TapWith; 047import cascading.tuple.TupleEntryCollector; 048import cascading.tuple.TupleEntryIterator; 049import cascading.tuple.TupleEntrySchemeCollector; 050import cascading.tuple.TupleEntrySchemeIterator; 051import cascading.util.CloseableIterator; 052import cascading.util.Util; 053import com.amazonaws.ClientConfiguration; 054import 
com.amazonaws.SdkClientException; 055import com.amazonaws.client.builder.AwsClientBuilder; 056import com.amazonaws.event.ProgressEvent; 057import com.amazonaws.event.ProgressEventType; 058import com.amazonaws.services.s3.AmazonS3; 059import com.amazonaws.services.s3.AmazonS3ClientBuilder; 060import com.amazonaws.services.s3.Headers; 061import com.amazonaws.services.s3.model.AmazonS3Exception; 062import com.amazonaws.services.s3.model.ObjectMetadata; 063import com.amazonaws.services.s3.model.PutObjectRequest; 064import com.amazonaws.services.s3.model.S3ObjectSummary; 065import com.amazonaws.services.s3.transfer.PersistableTransfer; 066import com.amazonaws.services.s3.transfer.Transfer; 067import com.amazonaws.services.s3.transfer.TransferManager; 068import com.amazonaws.services.s3.transfer.TransferManagerBuilder; 069import com.amazonaws.services.s3.transfer.Upload; 070import com.amazonaws.services.s3.transfer.internal.S3ProgressListener; 071import com.amazonaws.services.s3.transfer.model.UploadResult; 072import com.google.common.io.ByteSource; 073import com.google.common.io.FileBackedOutputStream; 074import org.slf4j.Logger; 075import org.slf4j.LoggerFactory; 076 077import static cascading.util.Util.isEmpty; 078 079/** 080 * Class S3Tap is a Cascading local-mode {@link Tap} providing read and write access to data stored in Amazon S3 buckets. 081 * <p> 082 * This Tap is not intended to be used with any of the other Cascading planners unless they specify they are local-mode 083 * compatible. 084 * <p> 085 * S3Tap can read a single key, all objects underneath a key-prefix, or all objects under a key-prefix that match 086 * a given globbing pattern. 087 * <p> 088 * See the various constructors for the available access parametrizations. Of note are the constructors that take 089 * a {@link URI} instance. The URI should be in the following format: 090 * {@code s3://[bucket]/<key|key-prefix><?glob>} 091 * <p> 092 * Where bucket is the only required value. 
The key references a single object; the key-prefix is used to access
 * a set of objects with a common prefix value. The glob value is used to further narrow the resulting object set.
 * <p>
 * The globbing pattern is specified by the {@link java.nio.file.FileSystem#getPathMatcher} method.
 * <p>
 * This Tap was designed to allow applications to effectively poll an S3 bucket for new keys to be processed.
 * <p>
 * When used with the {@link S3FileCheckpointer} class, a map of the keys last consumed for each bucket will be tracked
 * on disk, with the map surviving JVM restarts, allowing applications to exit and restart safely without
 * retrieving duplicate data.
 * <p>
 * The {@link S3Checkpointer#commit()} method is only called during a graceful shutdown of the Flow or JVM, but every
 * consumed key is passed to the S3Checkpointer, so custom implementations can choose to persist the key more
 * frequently.
 * <p>
 * AWS Credentials are handled by {@link com.amazonaws.auth.DefaultAWSCredentialsProviderChain}.
 */
public class S3Tap extends Tap<Properties, InputStream, OutputStream> implements FileType<Properties>, TapWith<Properties, InputStream, OutputStream>
  {
  /** Field LOG is the SLF4J logger for this class. */
  private static final Logger LOG = LoggerFactory.getLogger( S3Tap.class );

  /** Field SEQUENCE_TOKEN is the literal token {@code "{sequence}"}; its use is not shown in this part of the file. */
  public static final String SEQUENCE_TOKEN = "{sequence}";
  /** Field MIME_DIRECTORY is the content type string {@code "application/x-directory"}. */
  public static final String MIME_DIRECTORY = "application/x-directory";
  /** Field DEFAULT_DELIMITER is the key delimiter {@code "/"} used when a constructor is not given one. */
  public static final String DEFAULT_DELIMITER = "/";

  /** Field s3Client is the optional caller-supplied client; when null, {@code getS3Client} builds one on demand. */
  AmazonS3 s3Client;
  /** Field bucketName is the S3 bucket name; constructors reject null or empty values. */
  String bucketName;
  /** Field key is the S3 object key or key-prefix, may be null. */
  String key;
  /** Field filter is an optional predicate over key Strings, may be null. */
  Predicate<String> filter;
  /** Field delimiter is the key delimiter; starts as {@link #DEFAULT_DELIMITER} but constructors may overwrite it, including with null. */
  String delimiter = DEFAULT_DELIMITER;
  /** Field checkpointer is the optional {@link S3Checkpointer} tracking the last consumed key, may be null. */
  S3Checkpointer checkpointer;

  // NOTE(review): not assigned in the visible part of the file; presumably populated lazily elsewhere -- confirm
  private transient ObjectMetadata objectMetadata;

  /**
   * Method makeURI creates a new S3 URI from the given parameters.
   * <p>
   * Equivalent to calling {@link #makeURI(String, String, String)} with a null glob.
   *
   * @param bucketName the S3 bucket name
   * @param keyPrefix  the S3 object key or key-prefix
   * @return an URI instance
   */
  public static URI makeURI( String bucketName, String keyPrefix )
    {
    return makeURI( bucketName, keyPrefix, null );
    }

  /**
   * Method makeURI creates a new S3 URI from the given parameters.
   * <p>
   * The result has the form {@code s3://bucketName/keyPrefix?glob}; the glob is passed as the URI query component.
   *
   * @param bucketName the S3 bucket name, may not be null
   * @param keyPrefix  the S3 object key or key-prefix; null is treated as "/", and a leading "/" is added when missing
   * @param glob       the globbing pattern to apply to the keys, may be null
   * @return an URI instance
   * @throws IllegalArgumentException if bucketName is null, or if the parts do not form a valid URI
   */
  public static URI makeURI( String bucketName, String keyPrefix, String glob )
    {
    if( bucketName == null )
      throw new IllegalArgumentException( "bucketName may not be null" );

    try
      {
      // the URI path component must be absolute when an authority (the bucket) is present
      if( keyPrefix == null )
        keyPrefix = "/";
      else if( !keyPrefix.startsWith( "/" ) )
        keyPrefix = "/" + keyPrefix;

      return new URI( "s3", bucketName, keyPrefix, glob, null );
      }
    catch( URISyntaxException exception )
      {
      // surface as unchecked, keeping the original exception as the cause
      throw new IllegalArgumentException( exception.getMessage(), exception );
      }
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key, delimiter and filter are passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName )
    {
    this( scheme, bucketName, null, null, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter )
    {
    this( scheme, null, null, bucketName, key, delimiter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param filter     predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter )
    {
    this( scheme, bucketName, null, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param filter     predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName )
    {
    this( scheme, s3Client, bucketName, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key )
    {
    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The filter is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter )
    {
    this( scheme, s3Client, bucketName, key, delimiter, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No checkpointer is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key, delimiter and filter are passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName )
    {
    this( scheme, checkpointer, bucketName, null, null, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param filter       predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter )
    {
    this( scheme, checkpointer, bucketName, null, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param filter       predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName )
    {
    this( scheme, s3Client, checkpointer, bucketName, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER} and {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The filter is passed as {@code null}; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       predicate used to narrow the matching keys
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, filter, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key, delimiter and filter are passed as {@code null}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, SinkMode sinkMode )
    {
    this( scheme, bucketName, null, null, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, null, null, bucketName, key, delimiter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param filter     predicate used to narrow the matching keys
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, bucketName, null, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param filter     predicate used to narrow the matching keys
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied.
   *
   * @param scheme     the Scheme for this tap
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     predicate used to narrow the matching keys
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, null, bucketName, key, delimiter, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The filter is passed as {@code null}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, s3Client, bucketName, key, delimiter, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No checkpointer is supplied.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param bucketName the S3 bucket name, may not be null or empty
   * @param key        the S3 object key or key-prefix
   * @param delimiter  the key delimiter
   * @param filter     predicate used to narrow the matching keys
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, bucketName, key, delimiter, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key, delimiter and filter are passed as {@code null}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, null, null, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client is supplied.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param filter       predicate used to narrow the matching keys
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, null, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param filter       predicate used to narrow the matching keys
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, checkpointer, bucketName, key, DEFAULT_DELIMITER, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client is supplied.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       predicate used to narrow the matching keys
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    this( scheme, null, checkpointer, bucketName, key, delimiter, filter, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The key is passed as {@code null}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * Uses {@link #DEFAULT_DELIMITER}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, DEFAULT_DELIMITER, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The filter is passed as {@code null}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param sinkMode     the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, SinkMode sinkMode )
    {
    this( scheme, s3Client, checkpointer, bucketName, key, delimiter, null, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * This is the constructor the bucket/key based constructors delegate to. It stores the given values as-is;
   * the given delimiter replaces the field default even when it is {@code null}.
   *
   * @param scheme       the Scheme for this tap, handed to the Tap superclass
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param bucketName   the S3 bucket name, may not be null or empty
   * @param key          the S3 object key or key-prefix
   * @param delimiter    the key delimiter
   * @param filter       predicate used to narrow the matching keys
   * @param sinkMode     the SinkMode for this tap, handed to the Tap superclass
   * @throws IllegalArgumentException if bucketName is null or empty
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode )
    {
    super( scheme, sinkMode );
    this.s3Client = s3Client;
    this.checkpointer = checkpointer;
    this.bucketName = bucketName;

    if( isEmpty( this.bucketName ) )
      throw new IllegalArgumentException( "bucket name may not be null or empty" );

    this.key = key;
    this.delimiter = delimiter;
    this.filter = filter;
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param identifier the {@code s3://} URI naming the bucket, key and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier )
    {
    this( scheme, null, null, identifier, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No checkpointer is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param identifier the {@code s3://} URI naming the bucket, key and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier )
    {
    this( scheme, s3Client, null, identifier, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client is supplied; the sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param identifier   the {@code s3://} URI naming the bucket, key and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, null, checkpointer, identifier, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * The sink mode is {@link SinkMode#KEEP}.
   *
   * @param scheme       the Scheme for this tap
   * @param s3Client     the AmazonS3 client to use, may be null
   * @param checkpointer the S3Checkpointer tracking the last consumed key, may be null
   * @param identifier   the {@code s3://} URI naming the bucket, key and optional glob
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier )
    {
    this( scheme, s3Client, checkpointer, identifier, SinkMode.KEEP );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No S3 client or checkpointer is supplied.
   *
   * @param scheme     the Scheme for this tap
   * @param identifier the {@code s3://} URI naming the bucket, key and optional glob
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, URI identifier, SinkMode sinkMode )
    {
    this( scheme, null, null, identifier, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
   * <p>
   * No checkpointer is supplied.
   *
   * @param scheme     the Scheme for this tap
   * @param s3Client   the AmazonS3 client to use, may be null
   * @param identifier the {@code s3://} URI naming the bucket, key and optional glob
   * @param sinkMode   the SinkMode for this tap
   */
  public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier, SinkMode sinkMode )
    {
    this( scheme, s3Client, null, identifier, sinkMode );
    }

  /**
   * Constructor S3Tap creates a new S3Tap instance.
812 * 813 * @param scheme of Scheme 814 * @param checkpointer of S3Checkpointer 815 * @param identifier of URI 816 * @param sinkMode of SinkMode 817 */ 818 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode ) 819 { 820 this( scheme, null, checkpointer, identifier, sinkMode ); 821 } 822 823 /** 824 * Constructor S3Tap creates a new S3Tap instance. 825 * 826 * @param scheme of Scheme 827 * @param s3Client of AmazonS3 828 * @param checkpointer of S3Checkpointer 829 * @param identifier of URI 830 * @param sinkMode of SinkMode 831 */ 832 public S3Tap( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, S3Checkpointer checkpointer, URI identifier, SinkMode sinkMode ) 833 { 834 super( scheme, sinkMode ); 835 this.s3Client = s3Client; 836 this.checkpointer = checkpointer; 837 838 if( identifier == null ) 839 throw new IllegalArgumentException( "identifier may not be null" ); 840 841 if( !identifier.getScheme().equalsIgnoreCase( "s3" ) ) 842 throw new IllegalArgumentException( "identifier does not have s3 scheme" ); 843 844 this.bucketName = getBucketNameFor( identifier ); 845 846 if( isEmpty( this.bucketName ) ) 847 throw new IllegalArgumentException( "bucket name may not be null or empty" + identifier ); 848 849 this.key = cleanKey( identifier ); 850 851 if( identifier.getQuery() != null ) 852 filter = globPredicate( identifier.getQuery() ); 853 } 854 855 protected String getBucketNameFor( URI identifier ) 856 { 857 String authority = identifier.getAuthority(); 858 859 if( isEmpty( authority ) ) 860 throw new IllegalArgumentException( "identifier must have an authority: " + identifier ); 861 862 int pos = authority.indexOf( '@' ); 863 864 if( pos != -1 ) 865 return authority.substring( pos + 1 ); 866 867 return authority; 868 } 869 870 private static Predicate<String> globPredicate( String glob ) 871 { 872 String regex = getRegexForGlob( glob ); 873 Pattern 
pattern = Pattern.compile( regex ); 874 875 return string -> pattern.matcher( string ).matches(); 876 } 877 878 private static String getRegexForGlob( String glob ) 879 { 880 return (String) Util.invokeStaticMethod( 881 "sun.nio.fs.Globs", 882 "toUnixRegexPattern", 883 new Object[]{glob}, 884 new Class[]{String.class} 885 ); 886 } 887 888 @Override 889 public TapWith<Properties, InputStream, OutputStream> withScheme( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme ) 890 { 891 // don't lazily create s3Client 892 return create( scheme, s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), getSinkMode() ); 893 } 894 895 @Override 896 public TapWith<Properties, InputStream, OutputStream> withChildIdentifier( String identifier ) 897 { 898 URI uri; 899 900 if( identifier.startsWith( "s3://" ) ) 901 uri = URI.create( identifier ); 902 else if( identifier.startsWith( getBucketName() ) ) 903 uri = makeURI( identifier, null ); 904 else 905 uri = makeURI( getBucketName(), getKey() + ( identifier.startsWith( delimiter ) ? 
identifier : delimiter + identifier ) ); 906 907 // don't lazily create s3Client 908 return create( getScheme(), s3Client, uri, getSinkMode() ); 909 } 910 911 protected TapWith<Properties, InputStream, OutputStream> create( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, URI identifier, SinkMode sinkMode ) 912 { 913 return new S3Tap( scheme, s3Client, identifier, sinkMode ); 914 } 915 916 @Override 917 public TapWith<Properties, InputStream, OutputStream> withSinkMode( SinkMode sinkMode ) 918 { 919 // don't lazily create s3Client 920 return create( getScheme(), s3Client, getBucketName(), getKey(), getDelimiter(), getFilter(), sinkMode ); 921 } 922 923 protected TapWith<Properties, InputStream, OutputStream> create( Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, AmazonS3 s3Client, String bucketName, String key, String delimiter, Predicate<String> filter, SinkMode sinkMode ) 924 { 925 return new S3Tap( scheme, s3Client, bucketName, key, delimiter, filter, sinkMode ); 926 } 927 928 protected String cleanKey( URI identifier ) 929 { 930 String path = identifier.normalize().getPath(); 931 932 if( path.startsWith( "/" ) ) 933 path = path.substring( 1 ); 934 935 return path; 936 } 937 938 protected AmazonS3 getS3Client( Properties properties ) 939 { 940 // return provided client 941 if( s3Client != null ) 942 return s3Client; 943 944 AmazonS3ClientBuilder standard = AmazonS3ClientBuilder.standard(); 945 946 if( properties != null ) 947 { 948 String endpoint = properties.getProperty( S3TapProps.S3_ENDPOINT ); 949 String region = properties.getProperty( S3TapProps.S3_REGION, "us-east-1" ); 950 951 if( properties.containsKey( S3TapProps.S3_PROXY_HOST ) ) 952 { 953 ClientConfiguration config = new ClientConfiguration() 954 .withProxyHost( properties.getProperty( S3TapProps.S3_PROXY_HOST ) ) 955 .withProxyPort( PropertyUtil.getIntProperty( properties, S3TapProps.S3_PROXY_PORT, -1 ) ); 956 957 standard.withClientConfiguration( config 
); 958 } 959 960 if( endpoint != null ) 961 standard.withEndpointConfiguration( new AwsClientBuilder.EndpointConfiguration( endpoint, region ) ); 962 else 963 standard.setRegion( region ); 964 965 if( Boolean.parseBoolean( properties.getProperty( S3TapProps.S3_PATH_STYLE_ACCESS, "false" ) ) ) 966 standard.enablePathStyleAccess(); 967 } 968 969 return standard.build(); 970 } 971 972 /** 973 * Method getCheckpointer returns the checkpointer of this S3Tap object. 974 * 975 * @return the checkpointer (type S3Checkpointer) of this S3Tap object. 976 */ 977 public S3Checkpointer getCheckpointer() 978 { 979 return checkpointer; 980 } 981 982 /** 983 * Method getBucketName returns the bucketName of this S3Tap object. 984 * 985 * @return the bucketName (type String) of this S3Tap object. 986 */ 987 public String getBucketName() 988 { 989 return bucketName; 990 } 991 992 /** 993 * Method getKey returns the key of this S3Tap object. 994 * 995 * @return the key (type String) of this S3Tap object. 996 */ 997 public String getKey() 998 { 999 return key; 1000 } 1001 1002 protected String getMarker() 1003 { 1004 if( checkpointer != null ) 1005 return checkpointer.getLastKey( getBucketName() ); 1006 1007 return null; 1008 } 1009 1010 protected void setLastMarker( String marker ) 1011 { 1012 if( checkpointer != null ) 1013 checkpointer.setLastKey( getBucketName(), marker ); 1014 } 1015 1016 protected void commitMarker() 1017 { 1018 if( checkpointer != null ) 1019 checkpointer.commit(); 1020 } 1021 1022 /** 1023 * Method getFilter returns the filter of this S3Tap object. 1024 * 1025 * @return the filter (type Predicate) of this S3Tap object. 1026 */ 1027 public Predicate<String> getFilter() 1028 { 1029 return filter; 1030 } 1031 1032 /** 1033 * Method getDelimiter returns the delimiter of this S3Tap object. 1034 * 1035 * @return the delimiter (type String) of this S3Tap object. 
1036 */ 1037 public String getDelimiter() 1038 { 1039 return delimiter; 1040 } 1041 1042 @Override 1043 public String getIdentifier() 1044 { 1045 return makeStringIdentifier( getBucketName(), getKey() ); 1046 } 1047 1048 @Override 1049 public String getFullIdentifier( Properties conf ) 1050 { 1051 return getIdentifier(); 1052 } 1053 1054 @Override 1055 public boolean deleteResource( Properties conf ) throws IOException 1056 { 1057 AmazonS3 s3Client = getS3Client( conf ); 1058 1059 try 1060 { 1061 s3Client.deleteObject( getBucketName(), getKey() ); 1062 } 1063 catch( AmazonS3Exception exception ) 1064 { 1065 throw handleException( s3Client, exception ); 1066 } 1067 1068 return true; 1069 } 1070 1071 @Override 1072 public boolean createResource( Properties conf ) throws IOException 1073 { 1074 AmazonS3 s3Client = getS3Client( conf ); 1075 1076 try 1077 { 1078 s3Client.putObject( getBucketName(), getKey(), "" ); 1079 } 1080 catch( AmazonS3Exception exception ) 1081 { 1082 throw handleException( s3Client, exception ); 1083 } 1084 1085 return true; 1086 } 1087 1088 protected ObjectMetadata getObjectMetadata( Properties conf ) 1089 { 1090 try 1091 { 1092 if( objectMetadata == null ) 1093 objectMetadata = getS3Client( conf ).getObjectMetadata( getBucketName(), getKey() ); 1094 1095 return objectMetadata; 1096 } 1097 catch( AmazonS3Exception exception ) 1098 { 1099 throw handleException( getS3Client( conf ), exception ); 1100 } 1101 } 1102 1103 private class CheckedFilterInputStream extends FilterInputStream 1104 { 1105 public CheckedFilterInputStream( InputStream inputStream ) 1106 { 1107 super( inputStream ); 1108 } 1109 } 1110 1111 @Override 1112 public TupleEntryIterator openForRead( FlowProcess<? 
extends Properties> flowProcess, InputStream input ) throws IOException 1113 { 1114 AmazonS3 s3Client = getS3Client( flowProcess.getConfig() ); 1115 1116 final String[] identifier = new String[ 1 ]; 1117 1118 CloseableIterator<InputStream> iterator = new CloseableIterator<InputStream>() 1119 { 1120 S3Iterable iterable = S3Iterable.iterable( s3Client, getBucketName(), getKey() ) 1121 .withFilter( getFilter() ) 1122 .withMarker( getMarker() ); 1123 1124 Iterator<S3ObjectSummary> iterator = iterable.iterator(); 1125 InputStream lastInputStream; 1126 1127 @Override 1128 public boolean hasNext() 1129 { 1130 return iterator.hasNext(); 1131 } 1132 1133 @Override 1134 public InputStream next() 1135 { 1136 safeClose(); 1137 1138 S3ObjectSummary objectSummary = iterator.next(); 1139 1140 identifier[ 0 ] = makeStringIdentifier( objectSummary.getBucketName(), objectSummary.getKey() ); 1141 1142 flowProcess.getFlowProcessContext().setSourcePath( identifier[ 0 ] ); 1143 1144 if( LOG.isDebugEnabled() ) 1145 LOG.debug( "s3 retrieving: {}/{}, with size: {}", objectSummary.getBucketName(), objectSummary.getKey(), objectSummary.getSize() ); 1146 1147 // getObject does not seem to fill the InputStream, nor does the InputStream support marking 1148 // may make sense to wrap this iterator in a iterate ahead iterator that attempts to pre-fetch objects in a different thread 1149 lastInputStream = new CheckedFilterInputStream( s3Client.getObject( objectSummary.getBucketName(), objectSummary.getKey() ).getObjectContent() ) 1150 { 1151 @Override 1152 public void close() throws IOException 1153 { 1154 setLastMarker( objectSummary.getKey() ); 1155 super.close(); 1156 } 1157 }; 1158 1159 return lastInputStream; 1160 } 1161 1162 private void safeClose() 1163 { 1164 try 1165 { 1166 if( lastInputStream != null ) 1167 lastInputStream.close(); 1168 1169 lastInputStream = null; 1170 } 1171 catch( IOException exception ) 1172 { 1173 // do nothing 1174 } 1175 } 1176 1177 @Override 1178 public void 
close() 1179 { 1180 safeClose(); 1181 commitMarker(); 1182 } 1183 }; 1184 1185 return new TupleEntrySchemeIterator<Properties, InputStream>( flowProcess, this, getScheme(), iterator, () -> identifier[ 0 ] ); 1186 } 1187 1188 @Override 1189 public TupleEntryCollector openForWrite( FlowProcess<? extends Properties> flowProcess, OutputStream outputStream ) throws IOException 1190 { 1191 AmazonS3 s3Client = getS3Client( flowProcess.getConfig() ); 1192 1193 if( !s3Client.doesBucketExistV2( getBucketName() ) ) 1194 s3Client.createBucket( getBucketName() ); 1195 1196 final String key = resolveKey( flowProcess, getKey() ); 1197 1198 FileBackedOutputStream fileBackedOutputStream = new FileBackedOutputStream( 512_000, true ); 1199 DataOutputStream dataOutputStream = new DataOutputStream( fileBackedOutputStream ); 1200 ByteSource byteSource = fileBackedOutputStream.asByteSource(); 1201 1202 TransferManager transferManager = TransferManagerBuilder.standard().withS3Client( s3Client ).build(); 1203 1204 final String loggableIdentifier = makeStringIdentifier( getBucketName(), key ); 1205 1206 return new TupleEntrySchemeCollector<Properties, OutputStream>( flowProcess, this, getScheme(), dataOutputStream, loggableIdentifier ) 1207 { 1208 @Override 1209 public void close() 1210 { 1211 super.close(); // flushes and closes output 1212 1213 LOG.info( "s3 starting async upload: {}", loggableIdentifier ); 1214 1215 InputStream inputStream = openInputStream( byteSource, loggableIdentifier ); 1216 1217 try 1218 { 1219 ObjectMetadata metadata = new ObjectMetadata(); 1220 metadata.setHeader( Headers.CONTENT_LENGTH, (long) dataOutputStream.size() ); 1221 1222 Upload upload = createUpload( key, transferManager, new PutObjectRequest( getBucketName(), key, inputStream, metadata ) ); 1223 1224 UploadResult uploadResult = upload.waitForUploadResult(); // never return null, throws an exception 1225 1226 handleResult( upload, uploadResult, loggableIdentifier ); 1227 } 1228 catch( SdkClientException 
exception ) 1229 { 1230 LOG.error( "s3 upload failed on: " + loggableIdentifier, exception ); 1231 throw new TapException( "s3 upload failed on: " + loggableIdentifier, exception ); 1232 } 1233 catch( InterruptedException exception ) 1234 { 1235 // ignore 1236 } 1237 finally 1238 { 1239 transferManager.shutdownNow( false ); 1240 } 1241 } 1242 }; 1243 } 1244 1245 protected void handleResult( Upload upload, UploadResult uploadResult, String loggableIdentifier ) 1246 { 1247 Transfer.TransferState state = upload.getState(); 1248 1249 if( state == Transfer.TransferState.Canceled ) 1250 { 1251 LOG.warn( "s3 canceled upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() ); 1252 } 1253 else if( state == Transfer.TransferState.Failed ) // can this happen? 1254 { 1255 LOG.error( "s3 failed upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() ); 1256 throw new TapException( "s3 upload failed on: " + loggableIdentifier ); 1257 } 1258 else 1259 { 1260 LOG.info( "s3 completed upload: {}, with key: {}", getIdentifier(), uploadResult.getKey() ); 1261 } 1262 } 1263 1264 protected InputStream openInputStream( ByteSource byteSource, String loggableIdentifier ) 1265 { 1266 InputStream inputStream; 1267 try 1268 { 1269 inputStream = byteSource.openBufferedStream(); 1270 } 1271 catch( IOException exception ) 1272 { 1273 LOG.error( "s3 upload failed on: " + loggableIdentifier, exception ); 1274 throw new TapException( "s3 upload failed on: " + loggableIdentifier, exception ); 1275 } 1276 1277 return inputStream; 1278 } 1279 1280 protected Upload createUpload( String key, TransferManager transferManager, PutObjectRequest request ) 1281 { 1282 return transferManager.upload( request, new S3ProgressListener() 1283 { 1284 @Override 1285 public void onPersistableTransfer( PersistableTransfer persistableTransfer ) 1286 { 1287 if( LOG.isDebugEnabled() ) 1288 LOG.debug( "s3 for: {}, persistable transfer: {}", key, persistableTransfer ); 1289 } 1290 1291 @Override 1292 
public void progressChanged( ProgressEvent progressEvent ) 1293 { 1294 if( progressEvent.getEventType() == ProgressEventType.TRANSFER_FAILED_EVENT ) 1295 LOG.error( "s3 for: {}, event: {}", key, progressEvent ); 1296 if( progressEvent.getEventType() == ProgressEventType.TRANSFER_CANCELED_EVENT ) 1297 LOG.warn( "s3 for: {}, event: {}", key, progressEvent ); 1298 if( progressEvent.getEventType() == ProgressEventType.TRANSFER_PART_FAILED_EVENT ) 1299 LOG.warn( "s3 for: {}, event: {}", key, progressEvent ); 1300 else if( LOG.isDebugEnabled() ) 1301 LOG.debug( "s3 for: {}, event: {}", key, progressEvent ); 1302 } 1303 } ); 1304 } 1305 1306 protected String resolveKey( FlowProcess<? extends Properties> flowProcess, String key ) 1307 { 1308 int partNum = flowProcess.getIntegerProperty( PartitionTap.PART_NUM_PROPERTY, 0 ); 1309 1310 key = key.replace( SEQUENCE_TOKEN, String.format( "%05d", partNum ) ); 1311 1312 if( getScheme() instanceof FileFormat ) 1313 return key + "." + ( (FileFormat) getScheme() ).getExtension(); 1314 1315 return key; 1316 } 1317 1318 @Override 1319 public boolean resourceExists( Properties conf ) throws IOException 1320 { 1321 AmazonS3 s3Client = getS3Client( conf ); 1322 1323 try 1324 { 1325 if( getKey() == null ) 1326 return s3Client.doesBucketExistV2( getBucketName() ); 1327 1328 return s3Client.doesObjectExist( getBucketName(), getKey() ); 1329 } 1330 catch( AmazonS3Exception exception ) 1331 { 1332 throw handleException( s3Client, exception ); 1333 } 1334 } 1335 1336 protected AmazonS3Exception handleException( AmazonS3 s3Client, AmazonS3Exception exception ) 1337 { 1338 if( exception.getStatusCode() == 400 ) 1339 { 1340 LOG.error( "s3 request failed, try changing the AWS Region from: {}, using property: {}", s3Client.getRegionName(), S3TapProps.S3_REGION, exception ); 1341 } 1342 1343 return exception; 1344 } 1345 1346 @Override 1347 public long getModifiedTime( Properties conf ) throws IOException 1348 { 1349 return getObjectMetadata( conf 
).getLastModified().getTime(); 1350 } 1351 1352 @Override 1353 public boolean isDirectory( FlowProcess<? extends Properties> flowProcess ) throws IOException 1354 { 1355 return MIME_DIRECTORY.equalsIgnoreCase( getObjectMetadata( flowProcess.getConfig() ).getContentType() ); 1356 } 1357 1358 @Override 1359 public boolean isDirectory( Properties conf ) throws IOException 1360 { 1361 return isDirectory( FlowProcess.nullFlowProcess() ); 1362 } 1363 1364 @Override 1365 public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess ) throws IOException 1366 { 1367 return getChildIdentifiers( flowProcess.getConfig() ); 1368 } 1369 1370 @Override 1371 public String[] getChildIdentifiers( Properties conf ) throws IOException 1372 { 1373 return getChildIdentifiers( conf, 1, false ); 1374 } 1375 1376 @Override 1377 public String[] getChildIdentifiers( FlowProcess<? extends Properties> flowProcess, int depth, boolean fullyQualified ) throws IOException 1378 { 1379 return getChildIdentifiers( flowProcess.getConfig(), depth, fullyQualified ); 1380 } 1381 1382 @Override 1383 public String[] getChildIdentifiers( Properties conf, int depth, boolean fullyQualified ) throws IOException 1384 { 1385 if( !resourceExists( conf ) ) 1386 return new String[ 0 ]; 1387 1388 S3Iterable objects = S3Iterable.iterable( getS3Client( conf ), getBucketName(), getKey() ) 1389 .withDelimiter( getDelimiter() ) 1390 .withMaxDepth( depth ) 1391 .withFilter( getFilter() ) 1392 .withMarker( getMarker() ); 1393 1394 Iterator<S3ObjectSummary> iterator = objects.iterator(); 1395 1396 List<String> results = new ArrayList<>(); 1397 1398 while( iterator.hasNext() ) 1399 results.add( makePath( iterator, fullyQualified ) ); 1400 1401 return results.toArray( new String[ results.size() ] ); 1402 } 1403 1404 protected String makePath( Iterator<S3ObjectSummary> iterator, boolean fullyQualified ) 1405 { 1406 String key = iterator.next().getKey(); 1407 1408 if( fullyQualified ) 1409 return 
makeStringIdentifier( getBucketName(), key ); 1410 1411 return key.substring( getKey().length() ); 1412 } 1413 1414 @Override 1415 public long getSize( FlowProcess<? extends Properties> flowProcess ) throws IOException 1416 { 1417 return getSize( flowProcess.getConfig() ); 1418 } 1419 1420 @Override 1421 public long getSize( Properties conf ) throws IOException 1422 { 1423 if( isDirectory( conf ) ) 1424 return 0; 1425 1426 return getObjectMetadata( conf ).getInstanceLength(); 1427 } 1428 1429 protected static String makeStringIdentifier( String bucketName, String keyPrefix ) 1430 { 1431 if( isEmpty( keyPrefix ) ) 1432 return String.format( "s3://%s/", bucketName ); 1433 1434 return String.format( "s3://%s/%s", bucketName, keyPrefix ); 1435 } 1436 }