#!/bin/sh
#
# Runs the collocation pipeline for one topic and stores the sorted n-gram
# dump on the local filesystem.
#
# Usage: <this-script> TOPIC
#
#   TOPIC  Used as the HBase table name (-t), as a path component of the HDFS
#          working directory, and as a path component / file name prefix of
#          the local result file.
#
# Steps:
#   1. Run HBaseLuceneTokenizerDriver (CleansingAnalyzer) writing to
#      /user/dicode/twitter/collocation/TOPIC/<timestamp>/ on HDFS.
#   2. Run Mahout's CollocDriver on the tokenized-documents directory.
#   3. Dump every "part" file below collocation/ngrams with seqdumper, keeping
#      only the lines that start with "Key:".
#   4. Sort those lines (numeric, descending, key starting at field 5) into
#      <script-dir>/data/TOPIC/<timestamp>/TOPIC_<timestamp>.
#
# Exit status: 2 on missing TOPIC, 1 if a checked step fails, otherwise the
# status of the final sort.

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

TOPIC=${1:-}
if [ -z "${TOPIC}" ]; then
  # Without a topic every derived path and the table name would be malformed.
  printf 'Usage: %s TOPIC\n' "${0##*/}" >&2
  exit 2
fi

NOW=$(date +'%Y_%m_%d_%Hh%Mm')

# Resolve symlinks so the helper script and data dir are found next to the
# real location of this script.
SCRIPT_PATH=$(readlink -f "$0") || die "cannot resolve path of $0"
DICODE_ANALYSIS_HOME=$(dirname "${SCRIPT_PATH}")

DATA_DIR="${DICODE_ANALYSIS_HOME}/data"
HDFS_WORKING_DIR="/user/dicode/twitter/collocation/${TOPIC}/${NOW}"
HBASE_TABLE="${TOPIC}"

"${DICODE_ANALYSIS_HOME}/dicode-analysis" \
  eu.dicodeproject.analysis.hbase.HBaseLuceneTokenizerDriver \
  -a eu.dicodeproject.analysis.lucene.CleansingAnalyzer \
  -o "${HDFS_WORKING_DIR}/" \
  -t "${HBASE_TABLE}" \
  -f d \
  || die "tokenizer step failed for topic '${TOPIC}'"

"${DICODE_ANALYSIS_HOME}/dicode-analysis" \
  org.apache.mahout.vectorizer.collocations.llr.CollocDriver \
  --input "${HDFS_WORKING_DIR}/tokenized-documents/" \
  --output "${HDFS_WORKING_DIR}/collocation/" \
  || die "collocation step failed for topic '${TOPIC}'"

NGRAM_DIR="${HDFS_WORKING_DIR}/collocation/ngrams"

# mktemp replaces the deprecated, Debian-only tempfile(1).
TEMPFILE=$(mktemp "${TMPDIR:-/tmp}/twicoXXXXXX") || die "cannot create temp file"
# Remove the temp file on every exit path; the signal traps route through
# exit so the EXIT trap also runs in shells that skip it on signals.
trap 'rm -f -- "${TEMPFILE}"' EXIT
trap 'exit 1' HUP INT TERM

# List the part files, strip the listing down to the bare path (last
# space-separated column), dump each file and keep only the "Key:" lines.
# This stage is intentionally best-effort: no matches yields an empty result.
hadoop fs -ls "${NGRAM_DIR}" \
  | grep -F -- "${NGRAM_DIR}/part" \
  | sed -e 's!.* \([^ ]*/part.*\)$!\1!g' \
  | xargs -I {} "${DICODE_ANALYSIS_HOME}/dicode-analysis" seqdumper -s {} \
  | grep '^Key:' > "${TEMPFILE}"

CURRENT_DATA_DIR="${DATA_DIR}/${TOPIC}/${NOW}"
mkdir -p -- "${CURRENT_DATA_DIR}" || die "cannot create ${CURRENT_DATA_DIR}"

# Field 5 is presumably the score in seqdumper's "Key: ...: Value: ..." output
# -- TODO confirm against actual dump format.
sort -nr -k5 "${TEMPFILE}" > "${CURRENT_DATA_DIR}/${TOPIC}_${NOW}"