#!/bin/bash

# https://yql.yandex-team.ru/Operations/Xte2emim9YmO8xar9sNLhIZa5iQ9vDYMHCn7EWm2X5g=
# $ yt --proxy hahn read //home/webmaster/users/lester/SS15208/dataset_full_dl --format="<encode_utf8=%false>json" > dataset_full.json

# Convert the raw JSON dump to TSV. The converter presumably writes
# dataset_full.tsv next to its input (TODO confirm). Abort with its exit
# status on failure so the pools are never rebuilt from stale or missing data.
./convert_json_to_csv.py dataset_full.json || exit

# Keep the first line of the TSV separately; it is prepended to every pool.
head -n 1 dataset_full.tsv > dataset_full_header.tsv

# Negative class: every row whose line starts with "unknown".
grep -e '^unknown' dataset_full.tsv > dataset_negative.tsv

# Positive class: rows starting with "escalated" or "refusal", in random order.
# One grep pass with both patterns avoids intermediate files that would be
# left behind if the script were interrupted.
grep -e '^escalated' -e '^refusal' dataset_full.tsv | shuf > dataset_positive.tsv

# Semi class: rows starting with "doesntNeedReaction" or "falseAlarm", in
# random order. One grep pass with both patterns avoids intermediate files
# that would be left behind if the script were interrupted.
grep -e '^doesntNeedReaction' -e '^falseAlarm' dataset_full.tsv | shuf > dataset_semi.tsv

# Sample negatives so that negatives + semi rows match the number of positives.
positive_count=$(wc -l < dataset_positive.tsv)
semi_count=$(wc -l < dataset_semi.tsv)
negative_sample_count=$(( positive_count - semi_count ))
if (( negative_sample_count < 0 )); then
  # More semi rows than positives: there is nothing left to balance, so
  # sample no negatives rather than passing a negative line count on.
  printf 'warning: semi rows (%s) outnumber positive rows (%s); sampling 0 negatives\n' \
    "$semi_count" "$positive_count" >&2
  negative_sample_count=0
fi
shuf -n "$negative_sample_count" dataset_negative.tsv > dataset_negative_sample.tsv

# Pool for training to detect false positives (dataset_united.tsv):
# the header line followed by all three classes shuffled together.
{
  cat dataset_full_header.tsv
  cat dataset_negative_sample.tsv dataset_positive.tsv dataset_semi.tsv | shuf
} > dataset_united.tsv

# Pools for training on hard positives and testing on semi rows
# (dataset_train.tsv, dataset_test.tsv).
cat dataset_negative_sample.tsv dataset_positive.tsv | shuf > dataset_wo_semi.tsv
wo_semi_count=$(wc -l < dataset_wo_semi.tsv)
train_wo_semi_count=$(( wo_semi_count - semi_count ))
if (( train_wo_semi_count < 0 )); then
  # Not enough non-semi rows to hold any back for training.
  printf 'warning: semi rows (%s) outnumber non-semi rows (%s); train pool is empty\n' \
    "$semi_count" "$wo_semi_count" >&2
  train_wo_semi_count=0
fi

# Train pool: header + the first train_wo_semi_count shuffled non-semi rows.
{
  cat dataset_full_header.tsv
  head -n "$train_wo_semi_count" dataset_wo_semi.tsv
} > dataset_train.tsv

# Test pool: header + every remaining non-semi row mixed with the semi rows.
# head/tail are used instead of split so that no generically named chunk
# files are written and the whole remainder always reaches the test pool.
{
  cat dataset_full_header.tsv
  tail -n "+$(( train_wo_semi_count + 1 ))" dataset_wo_semi.tsv \
    | cat - dataset_semi.tsv \
    | shuf
} > dataset_test.tsv

rm dataset_wo_semi.tsv

# Pool for training to detect semi+positive vs. unknown (dataset_equal.tsv):
# the header line, then positives, semi rows and up to the same total number
# of randomly chosen negatives, all shuffled together. If fewer negatives
# exist than positive_count + semi_count, all of them are used.
{
  cat dataset_full_header.tsv
  shuf dataset_negative.tsv \
    | head -n "$(( positive_count + semi_count ))" \
    | cat - dataset_positive.tsv dataset_semi.tsv \
    | shuf
} > dataset_equal.tsv
