#!/bin/sh
#
# Tokenize news documents stored in HBase and compute n-gram collocations
# (log-likelihood ratio) over them with Mahout.
#
# Pipeline:
#   1. HBaseLuceneTokenizerDriver reads column ${COLUMN_FAMILY}:${COLUMN}
#      from table ${HBASE_TABLE} and writes tokenized documents to HDFS.
#   2. Mahout's CollocDriver consumes the tokenized documents and emits
#      collocations (up to 3-grams) under ${HDFS_WORKING_DIR}/collocation/.
#
# Output lands in a timestamped HDFS directory so repeated runs never
# overwrite each other.

# Abort on the first failing command and on use of unset variables; the
# second pipeline step depends on the first one having succeeded.
set -eu

# Timestamp used to build a unique HDFS working directory per run.
NOW=$(date +'%Y_%m_%d_%Hh%Mm')

# Resolve the directory containing this script (and the dicode-analysis
# launcher), following symlinks. NOTE: readlink -f is GNU/coreutils.
THIS=$(readlink -f "$0")
DICODE_ANALYSIS_HOME=$(dirname "${THIS}")

# NOTE(review): DATA_DIR is not referenced in this script — presumably
# consumed elsewhere or left over; confirm before removing.
DATA_DIR="/usr/lib/dicode/deploy-analysis/data"
HDFS_WORKING_DIR="/user/dicode/news/collocation/${NOW}"

# HBase source: table, column family and qualifier holding the raw text.
HBASE_TABLE="news_content"
COLUMN_FAMILY="doc"
COLUMN="text"

# Step 1: tokenize the HBase documents into ${HDFS_WORKING_DIR}/.
"${DICODE_ANALYSIS_HOME}/dicode-analysis" \
  eu.dicodeproject.analysis.hbase.HBaseLuceneTokenizerDriver \
  -a eu.dicodeproject.analysis.lucene.CleansingAnalyzer \
  -o "${HDFS_WORKING_DIR}/" \
  -t "${HBASE_TABLE}" \
  -f "${COLUMN_FAMILY}" \
  -c "${COLUMN}"

# Step 2: compute collocations (unigrams through trigrams) via LLR.
"${DICODE_ANALYSIS_HOME}/dicode-analysis" \
  org.apache.mahout.vectorizer.collocations.llr.CollocDriver \
  --input "${HDFS_WORKING_DIR}/tokenized-documents/" \
  --output "${HDFS_WORKING_DIR}/collocation/" \
  --maxNGramSize 3