package org.tribuo.util.tokens.universal;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

/* loaded from: input_file:org/tribuo/util/tokens/universal/UniversalTokenizer.class */
/**
 * A {@link Tokenizer} that walks a {@link CharSequence} one {@code char} at a time
 * and emits tokens through a small state machine ({@link State}).
 *
 * <p>Characters accepted as letters or digits are collected into word tokens.
 * Characters for which {@link #isNgram(char)} is true are collected into runs that
 * are emitted as single characters and/or overlapping character pairs, depending
 * on {@link #isGenerateUnigrams()} and {@link #isGenerateNgrams()}. Everything else
 * ends the current token; non-whitespace separators are additionally emitted as
 * tokens of their own when the tokenizer was built with {@code sendPunct == true}.
 *
 * <p>Instances hold mutable scanning state; call {@link #reset(CharSequence)}
 * before the first {@link #advance()}.
 */
public class UniversalTokenizer implements Tokenizer {
    /** Token length cap installed by the constructor; also the initial buffer size. */
    private static final int DEFAULT_MAX_TOKEN_LENGTH = 256;

    /** Number of slots added beyond the current token length when the buffer is full. */
    private static final int BUFFER_GROWTH = 32;

    /** Once the collected length reaches this value the pending token is flushed. */
    protected int maxTokenLength;
    /** Set when the whole input has been scanned and the trailing token flushed. */
    private boolean eofReached;
    /** Index in {@link #cs} of the character currently being handled. */
    private int pos;
    /** Index in {@link #cs} at which the token being collected started. */
    private int start;
    private boolean generateUnigrams;
    private boolean generateNgrams;
    private State state;
    /** The sequence being tokenized; {@code null} until {@link #reset(CharSequence)}. */
    private CharSequence cs;
    /** Characters of the token currently being collected. */
    private char[] buffer;
    private String currToken;
    private Token.TokenType currType;
    private int currPos;
    private int startOffset;
    private int endOffset;
    /** Number of valid characters in {@link #buffer}. */
    private int tokenLength;
    private boolean firstToken;
    /** True once at least one token has been dequeued since the last reset. */
    private boolean ready;

    @Config
    private boolean sendPunct;
    /** Tokens produced by the scanner but not yet handed to the caller. */
    private Queue<Range> queuedTokens;
    /** Already-consumed {@code Range} objects kept for reuse. */
    private Queue<Range> pool;
    /** The character currently being handled. */
    private char c;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/tribuo/util/tokens/universal/UniversalTokenizer$State.class */
    /** Scanner states. */
    public enum State {
        /** Not inside a token. */
        SKIPPING,
        /** Collecting a word (letters/digits) token. */
        COLLECTING,
        /** Collecting a run of characters for which {@code isNgram} is true. */
        NGRAM
    }

    /**
     * Creates a tokenizer with unigram and n-gram generation enabled and a
     * maximum token length of 256.
     *
     * @param z whether non-whitespace separator characters are emitted as tokens
     */
    public UniversalTokenizer(boolean z) {
        this.maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
        this.eofReached = false;
        this.generateUnigrams = true;
        this.generateNgrams = true;
        this.sendPunct = z;
        this.buffer = new char[this.maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<>();
        this.pool = new LinkedList<>();
    }

    /** Creates a tokenizer that does not emit separator characters as tokens. */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * Tests whether a character should be collected into a word token.
     *
     * <p>ASCII letters and digits are accepted directly; every other character up
     * to and including {@code '`'}, the range {@code '{'}..127 and the two code
     * units 210 and 211 are rejected; 3021..3029 are accepted; anything else is
     * delegated to {@link Character#isLetterOrDigit(char)}.
     *
     * @param c the character to test
     * @return {@code true} if the character counts as a letter or digit
     */
    public static boolean isLetterOrDigit(char c) {
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
            return true;
        }
        // NOTE(review): 210/211 are 'Ò'/'Ó' in Unicode; presumably a legacy-encoding
        // quote check - confirm before changing.
        if (c <= '`' || c == 210 || c == 211) {
            return false;
        }
        if (c >= '{' && c <= 127) {
            return false;
        }
        // NOTE(review): decimal 3021..3029; possibly intended as U+3021..U+3029 - confirm.
        if (c >= 3021 && c <= 3029) {
            return true;
        }
        return Character.isLetterOrDigit(c);
    }

    /**
     * Tests whether a character is a digit: ASCII {@code '0'..'9'}, or, above code
     * unit 255, whatever {@link Character#isDigit(char)} accepts.
     *
     * @param c the character to test
     * @return {@code true} if the character counts as a digit
     */
    public static boolean isDigit(char c) {
        if (c >= '0' && c <= '9') {
            return true;
        }
        if (c <= 255) {
            return false;
        }
        return Character.isDigit(c);
    }

    /**
     * Tests whether a character is treated as whitespace: space, {@code '\t'}
     * through {@code '\r'}, the control codes 1 through 4, or, above code unit 255,
     * whatever {@link Character#isWhitespace(char)} accepts.
     *
     * @param c the character to test
     * @return {@code true} if the character counts as whitespace
     */
    public static boolean isWhitespace(char c) {
        if (c == ' ') {
            return true;
        }
        if (c >= '\t' && c <= '\r') {
            return true;
        }
        if (c >= 1 && c <= 4) {
            return true;
        }
        if (c <= 255) {
            return false;
        }
        return Character.isWhitespace(c);
    }

    /**
     * Tests whether a character belongs to one of the ranges that are tokenized as
     * character unigrams/bigrams instead of whitespace-delimited words.
     *
     * @param c the character to test
     * @return {@code true} if the character is in an n-gram range
     */
    public static boolean isNgram(char c) {
        // U+3003..U+D7FF, except U+3040..U+30FF (Hiragana and Katakana blocks).
        if (c > 12290 && c <= 55295) {
            return c < 12352 || c > 12543;
        }
        // U+0600..U+06FF (Arabic)
        if (c >= 1536 && c <= 1791) {
            return true;
        }
        // U+F900..U+FAFF (CJK Compatibility Ideographs)
        if (c >= 63744 && c <= 64255) {
            return true;
        }
        // U+1100..U+11FF (Hangul Jamo)
        if (c >= 4352 && c <= 4607) {
            return true;
        }
        // U+FB50..U+FE2F
        if (c >= 64336 && c <= 65071) {
            return true;
        }
        // U+FE30..U+FE4F (CJK Compatibility Forms)
        if (c >= 65072 && c <= 65103) {
            return true;
        }
        // U+FE70..U+FEFF (Arabic Presentation Forms-B)
        if (c >= 65136 && c <= 65279) {
            return true;
        }
        // U+FF60..U+FFDF
        if (c >= 65376 && c <= 65503) {
            return true;
        }
        // U+0E00..U+0E7F (Thai)
        if (c >= 3584 && c <= 3711) {
            return true;
        }
        // U+0E80..U+0EFF (Lao)
        if (c >= 3712 && c <= 3839) {
            return true;
        }
        // U+0F00..U+0FBF (within Tibetan)
        if (c >= 3840 && c <= 4031) {
            return true;
        }
        // U+0B80..U+0BFF (Tamil)
        if (c >= 2944 && c <= 3071) {
            return true;
        }
        // U+0C00..U+0C7F (Telugu)
        if (c >= 3072 && c <= 3199) {
            return true;
        }
        // U+0C80..U+0CFF (Kannada)
        if (c >= 3200 && c <= 3327) {
            return true;
        }
        // U+0D00..U+0D7F (Malayalam)
        if (c >= 3328 && c <= 3455) {
            return true;
        }
        // U+10A0..U+10FF (Georgian)
        return c >= 4256 && c <= 4351;
    }

    /**
     * @return whether single characters are emitted for multi-character n-gram runs
     */
    public boolean isGenerateUnigrams() {
        return this.generateUnigrams;
    }

    /**
     * @param z whether single characters are emitted for multi-character n-gram runs
     */
    public void setGenerateUnigrams(boolean z) {
        this.generateUnigrams = z;
    }

    /**
     * @return whether overlapping character pairs are emitted for n-gram runs
     */
    public boolean isGenerateNgrams() {
        return this.generateNgrams;
    }

    /**
     * @param z whether overlapping character pairs are emitted for n-gram runs
     */
    public void setGenerateNgrams(boolean z) {
        this.generateNgrams = z;
    }

    /**
     * @return the collected length at which a pending token is flushed
     */
    public int getMaxTokenLength() {
        return this.maxTokenLength;
    }

    /**
     * Sets the collected length at which a pending token is flushed. The existing
     * buffer is not reallocated here; it grows on demand while collecting.
     *
     * @param i the new maximum token length
     */
    public void setMaxTokenLength(int i) {
        this.maxTokenLength = i;
    }

    /* renamed from: getProvenance, reason: merged with bridge method [inline-methods] */
    /**
     * Builds the provenance object for this tokenizer.
     *
     * @return a provenance created from this instance with the label "Tokenizer"
     */
    public ConfiguredObjectProvenance m16getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    /**
     * Moves to the next token, scanning more input if no token is already queued.
     *
     * @return {@code true} if a token is now available through the getters,
     *         {@code false} once the input is exhausted
     * @throws IllegalStateException if {@link #reset(CharSequence)} has not been called
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public final boolean advance() {
        if (this.cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }
        // A single scanned character can queue several tokens; drain those first.
        if (!this.queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }
        if (this.eofReached) {
            return false;
        }
        while (this.pos < this.cs.length()) {
            this.c = this.cs.charAt(this.pos);
            handleChar();
            this.pos++;
            if (!this.queuedTokens.isEmpty()) {
                handleQueued();
                return true;
            }
        }
        // End of input: flush whatever is still being collected.
        this.eofReached = true;
        makeTokens();
        if (this.queuedTokens.isEmpty()) {
            return false;
        }
        handleQueued();
        return true;
    }

    /**
     * Dequeues the next range, exposes it as the current token and returns the
     * range object to the reuse pool.
     */
    private void handleQueued() {
        this.ready = true;
        Range next = this.queuedTokens.poll();
        this.currToken = new String(next.buff, 0, next.len);
        this.startOffset = next.start;
        this.endOffset = next.end;
        // The first token after a reset gets an increment of 1 if it had none.
        if (this.firstToken && next.incr == 0) {
            next.incr = 1;
            this.firstToken = false;
        }
        this.currType = next.type;
        this.currPos = next.incr;
        this.pool.offer(next);
    }

    /**
     * Feeds the current character {@link #c} through the state machine, collecting
     * it into the pending token and/or flushing the pending token as appropriate.
     */
    protected void handleChar() {
        // Fast path: ASCII letters.
        if ((this.c >= 'a' && this.c <= 'z') || (this.c >= 'A' && this.c <= 'Z')) {
            collectWordChar();
            return;
        }
        // Fast path: plain space.
        if (this.c == ' ') {
            finishTokenAndSkip();
            return;
        }
        if (isNgram(this.c)) {
            // A word in progress ends where an n-gram run begins.
            if (this.state == State.COLLECTING) {
                makeTokens();
            }
            this.state = State.NGRAM;
            addChar();
            return;
        }
        // NUL characters are ignored entirely.
        if (this.c == 0) {
            return;
        }
        // '\n'..'\r' inside an n-gram run are dropped without ending the run.
        if (this.state == State.NGRAM && this.c >= '\n' && this.c <= '\r') {
            return;
        }
        if (isWhitespace(this.c)) {
            finishTokenAndSkip();
            return;
        }
        if (isDigit(this.c)) {
            if (this.state == State.NGRAM) {
                makeTokens();
            }
            // State is switched before addChar(), unlike the letter path.
            this.state = State.COLLECTING;
            addChar();
            return;
        }
        if (isLetterOrDigit(this.c)) {
            collectWordChar();
            return;
        }
        // Anything else is a separator.
        finishTokenAndSkip();
    }

    /** Ends a pending n-gram run, appends the current character, then enters COLLECTING. */
    private void collectWordChar() {
        if (this.state == State.NGRAM) {
            makeTokens();
        }
        addChar();
        this.state = State.COLLECTING;
    }

    /** Flushes any pending token, optionally emits the separator, then enters SKIPPING. */
    private void finishTokenAndSkip() {
        if (this.state != State.SKIPPING) {
            makeTokens();
        }
        sendPunct();
        this.state = State.SKIPPING;
    }

    /**
     * Queues the current character as a punctuation token when punctuation
     * emission is enabled and the character is not whitespace.
     */
    private void sendPunct() {
        if (!this.sendPunct || isWhitespace(this.c)) {
            return;
        }
        Range range = getRange();
        range.punct(this.c, this.pos);
        this.queuedTokens.add(range);
    }

    /**
     * Appends the current character to the collection buffer, growing the buffer
     * if it is full, and flushes the token once it reaches {@link #maxTokenLength}.
     */
    protected void addChar() {
        if (this.buffer.length <= this.tokenLength) {
            this.buffer = Arrays.copyOf(this.buffer, this.tokenLength + BUFFER_GROWTH);
        }
        if (this.tokenLength == 0) {
            // First character of a new token: remember where it began.
            this.start = this.pos;
        }
        this.buffer[this.tokenLength] = this.c;
        this.tokenLength++;
        if (this.tokenLength >= this.maxTokenLength) {
            makeTokens();
        }
    }

    /**
     * @return the start offset of the current token
     * @throws IllegalStateException if no token has been produced since the last reset
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public int getStart() {
        if (this.ready) {
            return this.startOffset;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the end offset of the current token
     * @throws IllegalStateException if no token has been produced since the last reset
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public int getEnd() {
        if (this.ready) {
            return this.endOffset;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the text of the current token
     * @throws IllegalStateException if no token has been produced since the last reset
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public String getText() {
        if (this.ready) {
            return this.currToken;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the type of the current token
     * @throws IllegalStateException if no token has been produced since the last reset
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public Token.TokenType getType() {
        if (this.ready) {
            return this.currType;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * Returns the increment value recorded for the most recently dequeued token.
     * Unlike the other getters this performs no readiness check.
     *
     * @return the {@code incr} value of the current token's range
     */
    public int getPos() {
        return this.currPos;
    }

    @Override // org.tribuo.util.tokens.Tokenizer
    /* renamed from: clone, reason: merged with bridge method [inline-methods] */
    /**
     * Creates a copy that shares the configuration of this tokenizer but has its
     * own buffer and queues and no input attached; the copy must be reset before use.
     *
     * @return the new tokenizer
     */
    public Tokenizer m15clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer) super.clone();
            // Give the copy its own mutable state so the two instances never share it.
            copy.buffer = new char[this.maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<>();
            copy.queuedTokens = new LinkedList<>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;
            return copy;
        } catch (CloneNotSupportedException e) {
            // Keep the cause so an unexpected failure is diagnosable.
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed", e);
        }
    }

    /**
     * Points the tokenizer at a new character sequence and clears all scanning
     * state, including any tokens that were queued but not yet consumed.
     *
     * @param charSequence the sequence to tokenize
     */
    @Override // org.tribuo.util.tokens.Tokenizer
    public void reset(CharSequence charSequence) {
        this.cs = charSequence;
        this.pos = 0;
        this.tokenLength = 0;
        this.start = -1;
        this.state = State.SKIPPING;
        this.eofReached = false;
        this.firstToken = true;
        this.c = (char) 0;
        this.startOffset = -1;
        this.endOffset = -1;
        this.currToken = null;
        this.ready = false;
        // Tokens left over from the previous sequence must not leak into the new
        // one; recycle their Range objects instead of discarding them.
        this.pool.addAll(this.queuedTokens);
        this.queuedTokens.clear();
    }

    /**
     * @return a pooled {@code Range} if one is available, otherwise a new one
     */
    private Range getRange() {
        return this.pool.isEmpty() ? new Range() : this.pool.remove();
    }

    /**
     * Converts the characters collected so far into queued tokens and empties the
     * collection buffer. Outside the NGRAM state the buffer becomes one token. In
     * the NGRAM state a single character becomes one token, while longer runs
     * yield a unigram per character and/or a bigram per adjacent pair according to
     * the generation flags.
     */
    protected void makeTokens() {
        if (this.tokenLength <= 0) {
            return;
        }
        if (this.state != State.NGRAM) {
            Range word = getRange();
            word.set(this.buffer, this.tokenLength, this.start);
            this.queuedTokens.add(word);
        } else if (this.tokenLength == 1) {
            // A lone n-gram character is always emitted, regardless of the flags.
            Range single = getRange();
            single.set(this.buffer[0], this.start);
            this.queuedTokens.add(single);
        } else {
            int last = this.tokenLength - 1;
            for (int i = 0; i < this.tokenLength; i++) {
                if (this.generateUnigrams) {
                    Range unigram = getRange();
                    unigram.set(this.buffer[i], this.start + i);
                    this.queuedTokens.add(unigram);
                }
                if (this.generateNgrams && i < last) {
                    Range bigram = getRange();
                    bigram.set(this.buffer[i], this.buffer[i + 1], this.start + i);
                    this.queuedTokens.add(bigram);
                }
            }
        }
        this.tokenLength = 0;
    }
}
