package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.util.IOUtil;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Greedy, longest-match-first wordpiece splitter backed by a fixed vocabulary.
 * <p>
 * A word is consumed from left to right. At each position the longest remaining
 * substring that is present in the vocabulary is emitted as a piece; any piece
 * that does not start at the beginning of the word is looked up, and emitted,
 * with a {@code "##"} prefix. If some position has no matching piece at all,
 * the whole word is replaced by the unknown token.
 * <p>
 * An instance can be built from an in-memory vocabulary, from the path of a
 * vocabulary file, or by populating the {@link Config}-annotated fields and
 * then calling {@link #postConfig()}.
 */
public class Wordpiece implements Configurable {
    /** Unknown token used when the caller does not supply one. */
    public static final String DEFAULT_UNKNOWN_TOKEN = "[UNK]";

    /** Per-word length limit used when the caller does not supply one. */
    private static final int DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD = 100;

    @Config(mandatory = true, description = "path to a vocabulary data file.")
    private String vocabPath;

    @Config(mandatory = false, description = "the value to use for 'UNKNOWN' tokens. Defaults to '[UNK]' which is a common default in BERT-based solutions.")
    private String unknownToken = DEFAULT_UNKNOWN_TOKEN;

    @Config(mandatory = false, description = "the maximum number of characters per word to consider. This helps eliminate doing extra work on pathological cases.")
    private int maxInputCharactersPerWord = DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD;

    /** Read-only view of the vocabulary; set by a constructor or by {@link #postConfig()}. */
    private Set<String> vocab;

    /**
     * No-argument constructor that leaves the vocabulary unset; the optional
     * fields keep their defaults. Presumably intended for the configuration
     * framework, which fills the annotated fields before {@link #postConfig()}
     * loads the vocabulary.
     */
    private Wordpiece() {
    }

    /**
     * Builds a splitter over the given vocabulary with the default unknown
     * token and the default per-word length limit.
     *
     * @param vocab the vocabulary of whole words and {@code "##"}-prefixed pieces.
     */
    public Wordpiece(Set<String> vocab) {
        this(vocab, DEFAULT_UNKNOWN_TOKEN);
    }

    /**
     * Builds a splitter over the given vocabulary with the default per-word
     * length limit.
     *
     * @param vocab the vocabulary of whole words and {@code "##"}-prefixed pieces.
     * @param unknownToken the token returned for words that cannot be split.
     */
    public Wordpiece(Set<String> vocab, String unknownToken) {
        this(vocab, unknownToken, DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD);
    }

    /**
     * Builds a splitter over the given vocabulary.
     * <p>
     * The set is wrapped in an unmodifiable view rather than copied, so later
     * changes the caller makes to {@code vocab} remain visible to this object.
     *
     * @param vocab the vocabulary of whole words and {@code "##"}-prefixed pieces.
     * @param unknownToken the token returned for words that cannot be split.
     * @param maxInputCharactersPerWord words longer than this (as measured by
     *        {@link String#length()}) are mapped straight to the unknown token.
     */
    public Wordpiece(Set<String> vocab, String unknownToken, int maxInputCharactersPerWord) {
        this.vocab = Collections.unmodifiableSet(vocab);
        this.unknownToken = unknownToken;
        this.maxInputCharactersPerWord = maxInputCharactersPerWord;
    }

    /**
     * Builds a splitter whose vocabulary is read from a file, using the default
     * unknown token and the default per-word length limit.
     *
     * @param vocabPath path to the vocabulary file; each line is one entry.
     * @throws UncheckedIOException if reading the vocabulary fails.
     */
    public Wordpiece(String vocabPath) {
        this(vocabPath, DEFAULT_UNKNOWN_TOKEN, DEFAULT_MAX_INPUT_CHARACTERS_PER_WORD);
    }

    /**
     * Builds a splitter whose vocabulary is read from a file.
     *
     * @param vocabPath path to the vocabulary file; each line is one entry.
     * @param unknownToken the token returned for words that cannot be split.
     * @param maxInputCharactersPerWord words longer than this (as measured by
     *        {@link String#length()}) are mapped straight to the unknown token.
     * @throws UncheckedIOException if reading the vocabulary fails.
     */
    public Wordpiece(String vocabPath, String unknownToken, int maxInputCharactersPerWord) {
        this.vocabPath = vocabPath;
        this.unknownToken = unknownToken;
        this.maxInputCharactersPerWord = maxInputCharactersPerWord;
        try {
            postConfig();
        } catch (IOException e) {
            // Constructors expose no checked exception, so surface the failure unchecked with its cause.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Loads the vocabulary from {@code vocabPath}: the lines returned by
     * {@code IOUtil.getLines} are de-duplicated into a set and stored behind an
     * unmodifiable view. Any previously held vocabulary is replaced.
     * <p>
     * NOTE(review): presumably the configuration framework's post-construction
     * hook from {@link Configurable} - confirm before adding {@code @Override}.
     *
     * @throws IOException if the vocabulary file cannot be read.
     */
    public void postConfig() throws IOException {
        this.vocab = Collections.unmodifiableSet(new HashSet<>(IOUtil.getLines(this.vocabPath)));
    }

    /**
     * Splits a single word into wordpieces.
     *
     * @param token the word to split.
     * @return the pieces in order, where every piece after the first carries a
     *         {@code "##"} prefix; a single-element list holding the unknown
     *         token if the word exceeds the length limit or some part of it has
     *         no vocabulary match; an empty list for an empty word.
     */
    public List<String> wordpiece(String token) {
        if (token.length() > this.maxInputCharactersPerWord) {
            return Collections.singletonList(this.unknownToken);
        }

        List<String> pieces = new ArrayList<>();
        int start = 0;
        while (start < token.length()) {
            int end = token.length();
            String match = null;
            // Try the longest candidate first and shrink it from the right until it is in the vocabulary.
            while (start < end) {
                String candidate = token.substring(start, end);
                if (start > 0) {
                    // Pieces that continue a word are stored in the vocabulary with this prefix.
                    candidate = "##" + candidate;
                }
                if (this.vocab.contains(candidate)) {
                    match = candidate;
                    break;
                }
                end--;
            }
            if (match == null) {
                // Nothing matches at this position, so the pieces found so far are discarded.
                return Collections.singletonList(this.unknownToken);
            }
            pieces.add(match);
            start = end;
        }
        return pieces;
    }

    /**
     * Returns the token substituted for words that cannot be split.
     *
     * @return the unknown token.
     */
    public String getUnknownToken() {
        return this.unknownToken;
    }

    /**
     * Returns the per-word length limit above which a word is mapped straight
     * to the unknown token.
     *
     * @return the maximum number of characters per word.
     */
    public int getMaxInputCharactersPerWord() {
        return this.maxInputCharactersPerWord;
    }
}
