package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.impl.SplitFunctionTokenizer;

/**
 * A {@link SplitFunctionTokenizer} whose split function separates text on
 * whitespace, punctuation, control-like code points and (optionally) CJK
 * ideographs.
 *
 * <p>The classification helpers ({@link #isPunctuation(int)},
 * {@link #isChinese(int)}, {@link #isControl(int)}) are public and static so
 * they can be used independently of a tokenizer instance.
 */
public class WordpieceBasicTokenizer extends SplitFunctionTokenizer {

    /** U+00A0 NO-BREAK SPACE; {@link Character#isWhitespace(int)} excludes it, so it is checked separately. */
    private static final int NO_BREAK_SPACE = 0x00A0;

    /** U+0000, always treated as a split point. */
    private static final int NULL_CODEPOINT = 0x0000;

    /** U+FFFD REPLACEMENT CHARACTER, always treated as a split point. */
    private static final int REPLACEMENT_CHARACTER = 0xFFFD;

    @Config(description = "split on Chinese tokens?")
    private boolean tokenizeChineseChars = true;

    /**
     * Builds the split function used by this tokenizer.
     *
     * <p>For each code point the returned function decides, in this order:
     * <ol>
     *   <li>whitespace or U+00A0: {@code SPLIT_AT};</li>
     *   <li>punctuation (see {@link #isPunctuation(int)}):
     *       {@code SPLIT_BEFORE_AND_AFTER_PUNCTUATION};</li>
     *   <li>a CJK ideograph (see {@link #isChinese(int)}) when
     *       {@code tokenizeChineseChars} is set:
     *       {@code SPLIT_BEFORE_AND_AFTER_WORD};</li>
     *   <li>U+0000, U+FFFD or a control-like code point
     *       (see {@link #isControl(int)}): {@code SPLIT_AT};</li>
     *   <li>anything else: {@code NO_SPLIT_WORD}.</li>
     * </ol>
     *
     * @param tokenizeChineseChars whether CJK ideographs are split into their own tokens
     * @return a split function implementing the rules above; it only inspects
     *         the code point and ignores its other two arguments
     */
    public static SplitFunctionTokenizer.SplitFunction createSplitFunction(boolean tokenizeChineseChars) {
        return (codepoint, ignoredInt, ignoredChars) -> {
            if (Character.isWhitespace(codepoint) || codepoint == NO_BREAK_SPACE) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_AT;
            }
            if (isPunctuation(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_BEFORE_AND_AFTER_PUNCTUATION;
            }
            if (tokenizeChineseChars && isChinese(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_BEFORE_AND_AFTER_WORD;
            }
            if (codepoint == NULL_CODEPOINT || codepoint == REPLACEMENT_CHARACTER || isControl(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_AT;
            }
            return SplitFunctionTokenizer.SplitResult.NO_SPLIT_WORD;
        };
    }

    /**
     * Tests whether a code point counts as punctuation.
     *
     * <p>Every non-alphanumeric printable ASCII character ({@code !} to
     * {@code /}, {@code :} to {@code @}, {@code [} to {@code `}, <code>{</code>
     * to {@code ~}) is treated as punctuation, including symbols such as
     * {@code $} and {@code ^} whose Unicode category is not a punctuation
     * category. Outside those ranges the Unicode general category decides.
     *
     * @param codepoint the Unicode code point to classify
     * @return {@code true} if the code point is treated as punctuation
     */
    public static boolean isPunctuation(int codepoint) {
        if (codepoint >= '!' && codepoint <= '/') {
            return true;
        }
        if (codepoint >= ':' && codepoint <= '@') {
            return true;
        }
        if (codepoint >= '[' && codepoint <= '`') {
            return true;
        }
        if (codepoint >= '{' && codepoint <= '~') {
            return true;
        }
        switch (Character.getType(codepoint)) {
            case Character.DASH_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Tests whether a code point lies in one of a fixed set of CJK ideograph
     * ranges.
     *
     * <p>The check is purely range based: U+4E00..U+9FFF, U+3400..U+4DBF,
     * U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2B820..U+2CEAF,
     * U+F900..U+FAFF and U+2F800..U+2FA1F.
     *
     * @param codepoint the Unicode code point to classify
     * @return {@code true} if the code point falls inside one of the ranges
     */
    public static boolean isChinese(int codepoint) {
        if (codepoint >= 0x4E00 && codepoint <= 0x9FFF) {
            return true;
        }
        if (codepoint >= 0x3400 && codepoint <= 0x4DBF) {
            return true;
        }
        if (codepoint >= 0x20000 && codepoint <= 0x2A6DF) {
            return true;
        }
        if (codepoint >= 0x2A700 && codepoint <= 0x2B73F) {
            return true;
        }
        if (codepoint >= 0x2B740 && codepoint <= 0x2B81F) {
            return true;
        }
        if (codepoint >= 0x2B820 && codepoint <= 0x2CEAF) {
            return true;
        }
        if (codepoint >= 0xF900 && codepoint <= 0xFAFF) {
            return true;
        }
        return codepoint >= 0x2F800 && codepoint <= 0x2FA1F;
    }

    /**
     * Tests whether a code point is control-like: Unicode general category
     * {@code CONTROL}, {@code FORMAT}, {@code PRIVATE_USE} or {@code SURROGATE}.
     *
     * <p>Tab, line feed and carriage return are explicitly excluded even
     * though their category is {@code CONTROL}.
     *
     * <p>The code point is compared directly rather than being converted to a
     * {@code char[]} first, so no array is allocated per call and an
     * out-of-range value yields {@code false} instead of an
     * {@link IllegalArgumentException}.
     *
     * @param codepoint the Unicode code point to classify
     * @return {@code true} if the code point is control-like and is not
     *         {@code \t}, {@code \n} or {@code \r}
     */
    public static boolean isControl(int codepoint) {
        if (codepoint == '\t' || codepoint == '\n' || codepoint == '\r') {
            return false;
        }
        switch (Character.getType(codepoint)) {
            case Character.CONTROL:
            case Character.FORMAT:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
                return true;
            default:
                return false;
        }
    }

    /**
     * Creates a tokenizer that splits CJK ideographs into their own tokens.
     */
    public WordpieceBasicTokenizer() {
        this(true);
    }

    /**
     * Creates a tokenizer.
     *
     * @param tokenizeChineseChars whether CJK ideographs are split into their own tokens
     */
    public WordpieceBasicTokenizer(boolean tokenizeChineseChars) {
        this.tokenizeChineseChars = tokenizeChineseChars;
        postConfig();
    }

    /**
     * Rebuilds the inherited split function from the current value of
     * {@code tokenizeChineseChars}. Both constructors call this after setting
     * the field.
     */
    public void postConfig() {
        this.splitFunction = createSplitFunction(this.tokenizeChineseChars);
    }

    /**
     * Returns the provenance of this tokenizer.
     *
     * @return a new {@link ConfiguredObjectProvenanceImpl} for this instance
     *         with the class-type label {@code "Tokenizer"}
     */
    /* renamed from: getProvenance, reason: merged with bridge method [inline-methods] */
    public ConfiguredObjectProvenance m16getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    /**
     * Creates an independent tokenizer with the same
     * {@code tokenizeChineseChars} setting.
     *
     * @return a new {@code WordpieceBasicTokenizer} configured like this one
     */
    @Override // org.tribuo.util.tokens.impl.SplitFunctionTokenizer
    /* renamed from: clone */
    public WordpieceBasicTokenizer mo9clone() {
        return new WordpieceBasicTokenizer(this.tokenizeChineseChars);
    }
}
