/*
 * Decompiled with CFR 0.152.
 */
package com.apple.foundationdb.record.lucene;

import com.apple.foundationdb.record.lucene.AlphanumericCjkAnalyzer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link AlphanumericCjkAnalyzer}, checking the tokens it emits for text that
 * mixes Latin-script words with CJK characters.
 *
 * <p>Each test builds an analyzer, runs one input string through it and compares the
 * emitted terms, in order, with a fixed expected list. Analyzers and token streams are
 * closed via try-with-resources so that no Lucene resources outlive a test.
 */
class AlphanumericCjkAnalyzerTest {

    /**
     * Mixed Latin/CJK text analyzed with the English stop-word set and the numeric
     * arguments 3 and 30: the expected output keeps every CJK character as its own
     * token and omits "of".
     */
    @Test
    void verifyTextWithMixedCharactersIsTokenizedAsExpected() throws IOException {
        String input = "water\u6c34\u6c34\ubb3c-of\u7684\u306e\uc758\u3002house\u5c4b\u5bb6\uc9d1\nyou\u4f60\u541b\ub108";
        List<String> expected = List.of("water", "\u6c34", "\u6c34", "\ubb3c", "\u7684", "\u306e", "\uc758", "house", "\u5c4b", "\u5bb6", "\uc9d1", "you", "\u4f60", "\u541b", "\ub108");
        try (AlphanumericCjkAnalyzer analyzer = new AlphanumericCjkAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, 3, 30, null)) {
            List<String> text = readTokenizedText(analyzer, input);
            Assertions.assertIterableEquals(expected, text, "Incorrect tokenized string!");
        }
    }

    /**
     * With the numeric arguments 4 and 6 (presumably min/max alphanumeric token length,
     * going by the test name) the three-letter words "all" and "can" are expected to be
     * dropped while single CJK characters are still emitted.
     */
    @Test
    void verifyAlphanumericMinTokenLengthIsRespected() throws IOException {
        String input = "\u5f53when\u5168all \u884c can \u6bcfeach\u3002\u9192-wake";
        List<String> expected = List.of("\u5f53", "when", "\u5168", "\u884c", "\u6bcf", "each", "\u9192", "wake");
        try (AlphanumericCjkAnalyzer analyzer = new AlphanumericCjkAnalyzer(CharArraySet.EMPTY_SET, 4, 6, null)) {
            List<String> text = readTokenizedText(analyzer, input);
            Assertions.assertIterableEquals(expected, text, "Incorrect tokenized string!");
        }
    }

    /**
     * With the numeric arguments 1 and 3 and the boolean flag set to {@code false}, the
     * four-letter words "when", "each" and "wake" are expected to be dropped entirely.
     * The expected list also shows "I" coming out lower-cased as "i".
     */
    @Test
    void verifyAlphanumericMaxTokenLengthIsRespected() throws IOException {
        String input = "\u5f53when\u5168all \u884c can \u6bcfeach\u3002\u9192-wake\uff0c\u6211I\n\u4e4bof";
        List<String> expected = List.of("\u5f53", "\u5168", "all", "\u884c", "can", "\u6bcf", "\u9192", "\u6211", "i", "\u4e4b", "of");
        try (AlphanumericCjkAnalyzer analyzer = new AlphanumericCjkAnalyzer(CharArraySet.EMPTY_SET, 1, 3, false, null)) {
            List<String> text = readTokenizedText(analyzer, input);
            Assertions.assertIterableEquals(expected, text, "Incorrect tokenized string!");
        }
    }

    /**
     * Same input and numeric arguments as the max-length test, but with the boolean flag
     * set to {@code true}: four-letter words are expected to be split into a three-letter
     * token followed by the remaining letter instead of being dropped.
     */
    @Test
    void breaksLongTokensApart() throws IOException {
        String input = "\u5f53when\u5168all \u884c can \u6bcfeach\u3002\u9192-wake\uff0c\u6211I\n\u4e4bof";
        List<String> expected = List.of("\u5f53", "whe", "n", "\u5168", "all", "\u884c", "can", "\u6bcf", "eac", "h", "\u9192", "wak", "e", "\u6211", "i", "\u4e4b", "of");
        try (AlphanumericCjkAnalyzer analyzer = new AlphanumericCjkAnalyzer(CharArraySet.EMPTY_SET, 1, 3, true, null)) {
            List<String> text = readTokenizedText(analyzer, input);
            Assertions.assertIterableEquals(expected, text, "Incorrect tokenized string!");
        }
    }

    /**
     * With only the English stop-word set supplied, the stop word "of" is expected to be
     * absent from the output while the other words and CJK characters remain.
     */
    @Test
    void verifyStopwordsAreExcluded() throws IOException {
        String input = "\u5f53of\u5168all \u884c can \u6bcfeach\u3002\u9192-wake";
        List<String> expected = List.of("\u5f53", "\u5168", "all", "\u884c", "can", "\u6bcf", "each", "\u9192", "wake");
        try (AlphanumericCjkAnalyzer analyzer = new AlphanumericCjkAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)) {
            List<String> text = readTokenizedText(analyzer, input);
            Assertions.assertIterableEquals(expected, text, "Incorrect tokenized String!");
        }
    }

    /**
     * Runs {@code input} through {@code analyzer} under the field name "text" and collects
     * the string form of every emitted term.
     *
     * <p>The stream is consumed using the reset / incrementToken / end / close sequence,
     * with the close handled by try-with-resources so it also happens when consumption
     * throws.
     *
     * @param analyzer the analyzer under test; not closed by this method
     * @param input the text to tokenize
     * @return the emitted terms, in emission order
     * @throws IOException if the token stream fails while being consumed
     */
    private static List<String> readTokenizedText(Analyzer analyzer, String input) throws IOException {
        List<String> terms = new ArrayList<>();
        try (TokenStream tokenizer = analyzer.tokenStream("text", input)) {
            CharTermAttribute termAttr = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                terms.add(termAttr.toString());
            }
            // Signal end-of-stream before the stream is closed.
            tokenizer.end();
        }
        return terms;
    }
}

