#!/bin/bash

#######################################
# Download one dataset table and split it into shuffled train/test files.
# Arguments:
#   $1 - dataset name; used as the last component of the table path that
#        is read and as the suffix of every output file name.
# Outputs (all written to the current directory):
#   all_$1.txt       - the Target/Tokens columns as dumped by `yt read`
#   train_$1.txt     - a random 80% of the pool lines (rounded down)
#   test_$1.txt      - the remaining pool lines
#   sm_train_$1.txt  - the first 10 lines of the train file
#   sm_test_$1.txt   - the first 10 lines of the test file
# Returns:
#   0 on success, 2 when the dataset name is missing, 1 when a step fails.
#######################################
generate() {
    if [[ -z "${1:-}" ]]; then
        printf 'usage: generate <dataset>\n' >&2
        return 2
    fi

    local name=$1
    local pool="all_${name}.txt"
    local train_file="train_${name}.txt"
    local test_file="test_${name}.txt"

    # Stop here on a failed download: splitting a missing or partial pool
    # would silently produce bogus train/test files.
    if ! yt --proxy hahn read "//home/webmaster/prod/cms/datasets/train/${name}" \
            --format "<columns=[Target;Tokens]>schemaful_dsv" > "$pool"; then
        printf 'generate: failed to read dataset %s\n' "$name" >&2
        return 1
    fi

    local pool_size train_size
    pool_size=$(wc -l < "$pool") || return 1
    # Integer floor of 80%; same result as int(n * 0.8) without needing a
    # Python interpreter on the host.
    train_size=$(( pool_size * 8 / 10 ))

    # Shuffle into a private temp file and slice it with head/tail. Unlike
    # `split`, this works when train_size is 0 and never touches fixed
    # names such as xaa/xab that may be left over from another run.
    local shuffled
    shuffled=$(mktemp "shuffled_${name}.XXXXXX") || return 1

    local status=0
    if ! shuf "$pool" > "$shuffled"; then
        status=1
    elif ! head -n "$train_size" "$shuffled" > "$train_file"; then
        status=1
    elif ! tail -n "+$(( train_size + 1 ))" "$shuffled" > "$test_file"; then
        status=1
    fi
    rm -f -- "$shuffled"
    if (( status != 0 )); then
        printf 'generate: failed to split dataset %s\n' "$name" >&2
        return 1
    fi

    # Small samples (head's default of 10 lines) for quick inspection.
    head "$train_file" > "sm_${train_file}" || return 1
    head "$test_file" > "sm_${test_file}" || return 1
}

# Build the pool/train/test files for each dataset, in a fixed order.
for cms in bitrix dle drupal insales instant joomla opencart webasyst wordpress; do
    generate "$cms"
done

# Concatenate every all_*.txt file in the current directory into one file.
cat all_*.txt > tokens.txt
