#!/bin/sh
# Deprecated: we use an oozie job instead
#
# Reads tweets from HBase and writes term-frequency vectors to HDFS.
#
# Usage: $0 TOPIC HBASE_INPUT
#   TOPIC       - query term; also used as the HDFS working-dir name
#   HBASE_INPUT - HBase table to read tweets from
#
# TODO: add stopword list via custom analyser ?!
# see: http://comments.gmane.org/gmane.comp.apache.mahout.user/6255

# Abort on the first failing command and on use of unset variables, so the
# vectorizer step never runs against a failed/partial export.
set -eu

# Validate the two required positional arguments up front (exits with a
# usage message if either is missing or empty).
TOPIC=${1:?"usage: $0 TOPIC HBASE_INPUT"}
HBASE_INPUT=${2:?"usage: $0 TOPIC HBASE_INPUT"}

# Resolve the directory this script lives in (following symlinks), where
# the dicode-analysis launcher is expected to sit.
THIS=$(readlink -f "$0")
DICODE_ANALYSIS_HOME=$(dirname "${THIS}")

# Timestamp gives every run its own fresh HDFS working directory.
NOW=$(date +'%Y_%m_%d_%Hh%Mm')

HDFS_WORKING_DIR="/user/dicode/vectorizer/${TOPIC}/${NOW}"
EXPORT_DIR="${HDFS_WORKING_DIR}/twitterexport"
VECTOR_DIR="${HDFS_WORKING_DIR}/sparsevectors"
# Only referenced by the commented-out LDA steps below.
LDA_DIR="${HDFS_WORKING_DIR}/lda"

# Step 1: export tweets matching TOPIC from the HBase table into
# sequence files under EXPORT_DIR.
"${DICODE_ANALYSIS_HOME}/dicode-analysis" eu.dicodeproject.analysis.export.TwitterExportDriver \
        -o "${EXPORT_DIR}" \
        -t "${HBASE_INPUT}" \
        -q "${TOPIC}"

# Step 2: vectorize the exported tweets.
# no normalization - we want to get absolute word counts
# TODO: minDF maxDFPercent are not needed with tf
"${DICODE_ANALYSIS_HOME}/dicode-analysis" org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles \
        -i "${EXPORT_DIR}" \
        -o "${VECTOR_DIR}" \
        -wt tf -seq \
        --minSupport 3 \
        --minDF 5 \
        --maxDFPercent 30 \
        --numReducers 10

# detect topics with lda

#${DICODE_ANALYSIS_HOME}/dicode-analysis org.apache.mahout.clustering.lda.LDADriver -i "${VECTOR_DIR}/tf-vectors" -o ${LDA_DIR} -k 50 -v 1000000 -x 100 -Dmapred.max.split.size=2000000

# print lda topics: TODO: insert model name state-<NN>
#${DICODE_ANALYSIS_HOME}/dicode-analysis org.apache.mahout.clustering.lda.LDAPrintTopics -i ${HDFS_WORKING_DIR}/lda/state-<NN> -d ${EXPORT_DIR}/dictionary.file-0 -dt sequencefile -o ${HDFS_WORKING_DIR}/topics
# for example:
#  ./mahout ldatopics -i vectorizer/saitama/2011_07_12_15h06m/lda/state-99 -d vectorizer/saitama/2011_07_12_15h06m/sparsevectors/dictionary.file-0 -dt sequencefile -o saitama_topics
# attention: document/topic mappings are also found in the lda directory