#!/usr/bin/python
# -*- coding: utf-8 -*-

# Split the full sample into training and test sets.

import sys
import re
import yt.wrapper as yt


def divide(key, recs):
    """Reducer: route the second-input rows of one key group to test or train.

    Rows whose ``@table_index`` is 0 are never emitted; they only mark that
    the current group is present in the first input table.  Every other row
    is emitted with ``@table_index`` rewritten to the *output* table index:
    0 (test) if a first-input row was seen earlier in the group, otherwise
    1 (train).

    The logic relies on first-input rows arriving before the other rows of
    the same group; a row that precedes the marker is routed to train.

    Args:
        key: reduce key of the group (unused).
        recs: iterable of row dicts, each carrying an ``@table_index`` field.

    Yields:
        The non-marker rows, mutated in place with the output table index.
    """
    # An explicit flag instead of remembering the title: an empty-string
    # title is a legitimate value and must not be mistaken for "not seen".
    seen_in_first_table = False
    for rec in recs:
        table_index = rec.pop('@table_index')
        if table_index == 0:
            seen_in_first_table = True
            continue
        rec['@table_index'] = 0 if seen_in_first_table else 1  # 0: test, 1: train
        yield rec


def main():
    """Split the full training table into test and train tables.

    Steps:
      1. Sort the marked-up set and the full training set by (title, body)
         into temporary tables.
      2. Reduce both sorted tables by (title, body) with ``divide``, which
         writes each full-set row to the test or the train output table.
      3. Re-sort both output tables by ``bid``.
      4. Write the row counts of the sorted full set, the test table and
         the train table to stderr, one per line.
    """
    markup_source = '//home/catalogia/users/kuzinaksu/set_for_markup_20000'
    markup_sorted = '//tmp/yuryz/set_for_markup_20000'
    yt.run_sort(markup_source, markup_sorted, sort_by=['title', 'body'])

    full_source = '//home/catalogia/users/yuryz/contest/train_full'
    full_sorted = '//tmp/yuryz/train_full'
    yt.run_sort(full_source, full_sorted, sort_by=['title', 'body'])

    test_table = '//home/catalogia/users/yuryz/contest/test'
    train_table = '//home/catalogia/users/yuryz/contest/train'

    # Input order matters: `divide` treats input table 0 (the marked-up set)
    # as the marker table; output order matches its 0 = test, 1 = train.
    yt.run_reduce(
        divide,
        [markup_sorted, full_sorted],
        [test_table, train_table],
        reduce_by=['title', 'body'],
        format=yt.YsonFormat(control_attributes_mode="row_fields")
    )
    yt.run_sort(test_table, sort_by=['bid'])
    yt.run_sort(train_table, sort_by=['bid'])

    # sys.stderr.write instead of the `print >> sys.stderr` statement: the
    # statement form only works on Python 2 and raises TypeError on Python 3.
    for table in (full_sorted, test_table, train_table):
        sys.stderr.write('{0}\n'.format(yt.row_count(table)))


# Run the split only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
