package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;
import org.tribuo.util.tokens.impl.WhitespaceTokenizer;

/* loaded from: input_file:org/tribuo/util/tokens/impl/wordpiece/WordpieceTokenizer.class */
public class WordpieceTokenizer implements Tokenizer {
    private static final Pattern accentsPattern = Pattern.compile("\\p{Mn}");

    @Config(mandatory = true, description = "an instance of Wordpiece which applies the 'wordpiece' algorithm")
    private Wordpiece wordpiece;

    @Config(description = "determines whether or not to lowercase the input text")
    private boolean toLowerCase;

    @Config(description = "performs whitespace tokenization before 'basic' tokenizer is applied (see basicTokenizer)")
    private Tokenizer whitespaceTokenizer;

    @Config(description = "performs some tokenization work on the input text before the wordpiece algorithm is applied to each resulting token.")
    private Tokenizer basicTokenizer;

    @Config(description = "determines whether or not to strip accents/diacritics from the input text")
    private boolean stripAccents;

    @Config(description = "a set of 'token' strings that should never be split regardless of whether they have e.g., punctuation in the middle.  No entries should have whitespace in them.")
    private Set<String> neverSplitTokens;
    private boolean reset;
    private Token currentToken;
    private List<Token> currentWordpieceTokens;
    private int currentWordpieceIndex;

    private WordpieceTokenizer() {
        this.toLowerCase = true;
        this.whitespaceTokenizer = new WhitespaceTokenizer();
        this.basicTokenizer = new WordpieceBasicTokenizer();
        this.stripAccents = true;
        this.neverSplitTokens = Collections.emptySet();
        this.currentWordpieceTokens = new ArrayList();
    }

    public WordpieceTokenizer(Wordpiece wordpiece, Tokenizer tokenizer, boolean z, boolean z2, Set<String> set) {
        this.toLowerCase = true;
        this.whitespaceTokenizer = new WhitespaceTokenizer();
        this.basicTokenizer = new WordpieceBasicTokenizer();
        this.stripAccents = true;
        this.neverSplitTokens = Collections.emptySet();
        this.currentWordpieceTokens = new ArrayList();
        this.wordpiece = wordpiece;
        this.basicTokenizer = tokenizer;
        this.toLowerCase = z;
        this.stripAccents = z2;
        this.neverSplitTokens = set;
    }

    /* renamed from: getProvenance, reason: merged with bridge method [inline-methods] */
    public ConfiguredObjectProvenance m19getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public void reset(CharSequence charSequence) {
        this.reset = true;
        this.whitespaceTokenizer.reset(charSequence);
        this.currentWordpieceTokens.clear();
        this.currentWordpieceIndex = -1;
        if (this.whitespaceTokenizer.advance()) {
            this.currentToken = this.whitespaceTokenizer.getToken();
            getWordpieceTokens();
        }
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public boolean advance() {
        if (!this.reset) {
            throw new IllegalStateException("WordpieceTokenizer has not been reset.");
        }
        this.currentWordpieceIndex++;
        if (this.currentWordpieceIndex < this.currentWordpieceTokens.size()) {
            return true;
        }
        if (!this.whitespaceTokenizer.advance()) {
            return false;
        }
        this.currentToken = this.whitespaceTokenizer.getToken();
        getWordpieceTokens();
        this.currentWordpieceIndex = 0;
        if (this.currentWordpieceTokens.size() == 0) {
            return advance();
        }
        return true;
    }

    private static String normalize(String str) {
        return accentsPattern.matcher(Normalizer.normalize(str, Normalizer.Form.NFD)).replaceAll("");
    }

    private void getWordpieceTokens() {
        this.currentWordpieceTokens.clear();
        String str = this.currentToken.text;
        if (this.neverSplitTokens.contains(str)) {
            this.currentWordpieceTokens.add(this.currentToken);
            return;
        }
        for (Token token : this.basicTokenizer.tokenize(str)) {
            String str2 = token.text;
            if (this.toLowerCase) {
                str2 = str2.toLowerCase();
            }
            if (this.stripAccents) {
                str2 = normalize(str2);
            }
            List<String> wordpiece = this.wordpiece.wordpiece(str2);
            if (wordpiece.size() == 0) {
                return;
            }
            if (wordpiece.size() == 1) {
                String str3 = wordpiece.get(0);
                int i = token.start + this.currentToken.start;
                int i2 = token.end + this.currentToken.start;
                if (str3.equals(this.wordpiece.getUnknownToken())) {
                    this.currentWordpieceTokens.add(new Token(str3, i, i2, Token.TokenType.UNKNOWN));
                } else {
                    this.currentWordpieceTokens.add(new Token(str3, i, i2, Token.TokenType.WORD));
                }
            } else {
                int i3 = this.currentToken.start + token.start;
                for (String str4 : wordpiece) {
                    Token.TokenType tokenType = Token.TokenType.PREFIX;
                    int length = i3 + str4.length();
                    if (str4.startsWith("##")) {
                        length -= 2;
                        tokenType = Token.TokenType.SUFFIX;
                    }
                    this.currentWordpieceTokens.add(new Token(str4, i3, length, tokenType));
                    i3 = length;
                }
            }
        }
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public Token getToken() {
        if (this.currentWordpieceIndex < this.currentWordpieceTokens.size()) {
            return this.currentWordpieceTokens.get(this.currentWordpieceIndex);
        }
        throw new IllegalStateException("WordpieceTokenizer is not ready.");
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public String getText() {
        return getToken().text;
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public int getStart() {
        return getToken().start;
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public int getEnd() {
        return getToken().end;
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    public Token.TokenType getType() {
        return getToken().type;
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    /* renamed from: clone, reason: merged with bridge method [inline-methods] */
    public WordpieceTokenizer m18clone() {
        try {
            WordpieceTokenizer wordpieceTokenizer = (WordpieceTokenizer) super.clone();
            wordpieceTokenizer.whitespaceTokenizer = this.whitespaceTokenizer.m18clone();
            wordpieceTokenizer.basicTokenizer = this.basicTokenizer.m18clone();
            wordpieceTokenizer.reset = false;
            wordpieceTokenizer.currentToken = null;
            wordpieceTokenizer.currentWordpieceTokens.clear();
            wordpieceTokenizer.currentWordpieceIndex = -1;
            return wordpieceTokenizer;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError("WordpieceTokenizer is Cloneable, but clone call failed");
        }
    }
}
