/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer.encoders;

import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;

/**
 * Encodes a piece of text by splitting it into words, counting how often each
 * distinct word occurs, and handing every distinct word to a per-word encoder
 * with a weight that grows logarithmically with its count.
 *
 * <p>Text is tokenized on runs of non-word characters (regex {@code \W+});
 * empty tokens are dropped. Words are accumulated in an internal multiset by
 * the {@code addText} methods and written to a vector (and then forgotten) by
 * {@link #flush(double, Vector)}.
 *
 * <p>Instances carry mutable accumulated state and this class performs no
 * synchronization of its own.
 */
public class TextValueEncoder
extends FeatureVectorEncoder {
    private static final double LOG_2 = Math.log(2.0);

    // Compiled once for the whole class rather than once per instance; the
    // splitter is only ever read after construction.
    private static final Splitter ON_NON_WORD = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();

    // Encoder that receives each distinct word on flush; replaceable via setWordEncoder.
    private FeatureVectorEncoder wordEncoder;

    // Word occurrences accumulated since the last flush.
    private final Multiset<String> counts;

    /**
     * Creates an encoder whose per-word encoder defaults to a
     * {@link StaticWordValueEncoder} constructed with the same name.
     *
     * @param name name passed to the superclass and to the default word encoder
     */
    public TextValueEncoder(String name) {
        // NOTE(review): the literal 2 is the second superclass-constructor
        // argument; presumably the probe count -- confirm against FeatureVectorEncoder.
        super(name, 2);
        this.wordEncoder = new StaticWordValueEncoder(name);
        this.counts = HashMultiset.create();
    }

    /**
     * Adds the given UTF-8 text to the pending word counts and immediately
     * flushes them into {@code data}.
     *
     * <p>Any words previously accumulated through {@code addText} and not yet
     * flushed are written out by this call as well.
     *
     * @param originalForm UTF-8 encoded text
     * @param weight       multiplier applied to every word's weight
     * @param data         vector passed on to the word encoder
     */
    @Override
    public void addToVector(byte[] originalForm, double weight, Vector data) {
        this.addText(originalForm);
        this.flush(weight, data);
    }

    /**
     * Decodes the bytes as UTF-8 and adds the resulting text to the pending
     * word counts without touching any vector.
     *
     * @param originalForm UTF-8 encoded text
     */
    public void addText(byte[] originalForm) {
        this.addText(new String(originalForm, Charsets.UTF_8));
    }

    /**
     * Tokenizes the text and adds one occurrence per token to the pending
     * word counts without touching any vector.
     *
     * @param text text to tokenize and count
     */
    public void addText(CharSequence text) {
        for (String word : this.tokenize(text)) {
            this.counts.add(word);
        }
    }

    /**
     * Passes every distinct pending word to the word encoder with weight
     * {@code weight * log2(1 + count)}, then clears the pending counts.
     *
     * @param weight multiplier applied to every word's weight
     * @param data   vector passed on to the word encoder
     */
    public void flush(double weight, Vector data) {
        for (String word : this.counts.elementSet()) {
            // count() returns an int; add in double so that a count of
            // Integer.MAX_VALUE cannot wrap negative and turn the weight into NaN.
            double occurrences = 1.0 + this.counts.count(word);
            this.wordEncoder.addToVector(word, weight * Math.log(occurrences) / LOG_2, data);
        }
        this.counts.clear();
    }

    /**
     * Always returns 0 in this class; the arguments are ignored.
     */
    @Override
    protected int hashForProbe(byte[] originalForm, int dataSize, String name, int probe) {
        return 0;
    }

    /**
     * Decodes the bytes as UTF-8, tokenizes them, and returns one hash per
     * token obtained from {@link #hashForProbe(byte[], int, String, int)}.
     *
     * <p>NOTE(review): {@code hashForProbe} in this class always returns 0, so
     * unless a subclass overrides it the result is one zero per token. Confirm
     * whether delegating to the word encoder was intended.
     *
     * @return a list with one entry per token, in token order
     */
    @Override
    protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
        ArrayList<Integer> hashes = new ArrayList<Integer>();
        for (String word : this.tokenize(new String(originalForm, Charsets.UTF_8))) {
            // bytesForString is not defined in this class; presumably inherited
            // from FeatureVectorEncoder.
            hashes.add(this.hashForProbe(this.bytesForString(word), dataSize, name, probe));
        }
        return hashes;
    }

    /**
     * Splits the input on runs of non-word characters, omitting empty tokens.
     * Subclasses may override this to change tokenization.
     *
     * @param originalForm text to split
     * @return the tokens, in order of appearance
     */
    protected Iterable<String> tokenize(CharSequence originalForm) {
        return ON_NON_WORD.split(originalForm);
    }

    /**
     * Renders the text as a bracketed list of the word encoder's string form
     * of each token, e.g. {@code [a, b, c]} where each item is
     * {@code wordEncoder.asString(token)}.
     *
     * @param originalForm text to tokenize and describe
     * @return the bracketed, {@code ", "}-separated description
     */
    @Override
    public String asString(String originalForm) {
        StringBuilder r = new StringBuilder();
        r.append('[');
        for (String word : this.tokenize(originalForm)) {
            // Length > 1 means something follows the opening bracket already.
            if (r.length() > 1) {
                r.append(", ");
            }
            r.append(this.wordEncoder.asString(word));
        }
        r.append(']');
        return r.toString();
    }

    /**
     * Replaces the encoder that receives each distinct word on flush and that
     * is used by {@link #asString(String)}.
     *
     * @param wordEncoder the new per-word encoder
     */
    public final void setWordEncoder(FeatureVectorEncoder wordEncoder) {
        this.wordEncoder = wordEncoder;
    }
}

