#April 7, 2017

# Command-line handling.
# Exactly three positional arguments are required. With any other count the
# script reports how many it received, prints the usage text on stderr and
# stops with exit status 1.
[ "$#" -eq 3 ] || {
  printf '%s\n' \
    "The number of parameter is: $#" \
    "Need to type 3 parameters: " >&2
  printf '\t%s\n' \
    "1.folder directory: word_data or char_data" \
    "2.data type: celltype or antigen" \
    "3.'n' of ngram: from 1 to 10 " >&2
  exit 1
}

# Bind the positional arguments to the names used by the rest of the script.
dir="$1"     # folder holding the input and all generated files
type="$2"    # data type; becomes part of every file name
gram_n="$3"  # 'n' of the n-gram, passed on to text-ngram.py

# Print the requested n on stdout.
echo "$gram_n"
# Filter the raw table with count.py, then separate the result into three
# files holding label 1, label 2 and the text features.
# 'cX' in a filename means extracting items whose counts are larger than X in
# count.py (X is 10 here).
#   <prefix>.txt          stdout of count.py
#   <prefix>_l1.txt       column 1  (label 1)
#   <prefix>_l2.txt       column 2  (label 2)
#   <prefix>_feature.txt  columns 3 and up (text features)
# All paths are quoted so a folder name containing spaces or glob characters
# is passed through unchanged.
c10_prefix="${dir}/c10_human_${type}"

# Abort when count.py fails: otherwise every later step would silently work
# on an empty or truncated file.
if ! python3 count.py "${dir}/human_${type}.tsv" 10 > "${c10_prefix}.txt"; then
  echo "count.py failed on ${dir}/human_${type}.tsv" 1>&2
  exit 1
fi

cut -f1 "${c10_prefix}.txt" > "${c10_prefix}_l1.txt"
cut -f2 "${c10_prefix}.txt" > "${c10_prefix}_l2.txt"
cut -f3- "${c10_prefix}.txt" > "${c10_prefix}_feature.txt"

#Transform non-numerical labels to integer labels
# text-ngram.py is called once per label column with: folder, data type,
# n-gram order, input file, output file. The input is the *_l1 / *_l2 file and
# the output is written next to it as *_intl1 / *_intl2.
# The script stops if a conversion fails, so later steps never combine a
# missing or partial label file.
for label in l1 l2
do
	if ! python3 text-ngram.py "$dir" "$type" "$gram_n" \
		"${dir}/c10_human_${type}_${label}.txt" \
		"${dir}/c10_human_${type}_int${label}.txt"
	then
		echo "text-ngram.py failed on ${dir}/c10_human_${type}_${label}.txt" 1>&2
		exit 1
	fi
done

#Combine label 1, label 2 and text features together again.
# paste joins the three files line by line with tabs, so each output line is
# <int label 1> TAB <int label 2> TAB <text features>.
paste "${dir}/c10_human_${type}_intl1.txt" \
  "${dir}/c10_human_${type}_intl2.txt" \
  "${dir}/c10_human_${type}_feature.txt" \
  > "${dir}/c10_human_${type}_intl12_feature.txt"

#Shuffle the combined file including label1, label2 and text features.
# GNU shuf is installed as 'gshuf' by Homebrew coreutils on macOS and as plain
# 'shuf' on Linux; use whichever is available instead of hard-coding gshuf.
if command -v gshuf >/dev/null 2>&1; then
  shuffler=gshuf
elif command -v shuf >/dev/null 2>&1; then
  shuffler=shuf
else
  echo "Neither gshuf nor shuf was found in PATH" 1>&2
  exit 1
fi
"$shuffler" "${dir}/c10_human_${type}_intl12_feature.txt" \
  > "${dir}/shuf_c10_human_${type}_intl12_feature.txt"

#Separate the samples into 10-fold cross validation datasets
# The shuffled file is cut into 10 contiguous test slices of test_size lines
# each (test_size = line count / 10, rounded down). For fold i the test file
# holds the slice and the train file holds every other line.
# The 10th slice runs to the end of the file, so the up to 9 lines left over by
# the integer division are tested once instead of ending up in every training
# set and in no test set.
shuffled="${dir}/shuf_c10_human_${type}_intl12_feature.txt"
file_size=$(wc -l < "$shuffled")
# Bare variable name inside $(( )) tolerates the leading blanks some wc
# implementations print.
test_size=$(( file_size / 10 ))
echo "$test_size"

for i in {1..10}
do
	lo=$(( (i - 1) * test_size ))   # last line before this fold's test slice
	hi=$(( i * test_size ))         # last line of the slice (ignored for fold 10)
	is_last=$(( i == 10 ))          # 1 for the final fold, 0 otherwise
	awk -v lo="$lo" -v hi="$hi" -v last="$is_last" \
		'NR > lo && (last == 1 || NR <= hi)' "$shuffled" \
		> "${dir}/c10_test${i}_human_${type}_intl12_feature.txt"
	awk -v lo="$lo" -v hi="$hi" -v last="$is_last" \
		'NR <= lo || (last != 1 && NR > hi)' "$shuffled" \
		> "${dir}/c10_train${i}_human_${type}_intl12_feature.txt"
done

#Separate each training file and test file into three files involving label 1, label 2, text features, separately.
# For every fold (1..10) and for both the train and the test part:
#   *_intl1.txt    column 1
#   *_intl2.txt    column 2
#   *_feature.txt  columns 3 and up
# Paths are quoted so a folder name with spaces or glob characters is safe.
for i in {1..10}
do
	for part in train test
	do
		fold_prefix="${dir}/c10_${part}${i}_human_${type}"
		cut -f1 "${fold_prefix}_intl12_feature.txt" > "${fold_prefix}_intl1.txt"
		cut -f2 "${fold_prefix}_intl12_feature.txt" > "${fold_prefix}_intl2.txt"
		cut -f3- "${fold_prefix}_intl12_feature.txt" > "${fold_prefix}_feature.txt"
	done
done