#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import argparse
from nile.api.v1 import (
    clusters,
)
import json


def add_field(dct, name, value):
    """Store ``value`` under key ``name`` in ``dct`` and return ``dct``.

    The mapping is modified in place; the returned object is the very same
    ``dct`` that was passed in, which lets the call be used inside
    expressions such as comprehensions.
    """
    dct[name] = value
    return dct


def toloka_process(rec):
    """Build the Toloka task dict for one source record.

    The title is ``"<ParentName> - <Name>"``; the remaining output fields
    are copied from the record under new key names. A ``KeyError`` is raised
    if the record lacks any of the keys read here.
    """
    task = {'title': '{} - {}'.format(rec['ParentName'], rec['Name'])}

    # (output key, record key) pairs copied over without transformation.
    passthrough = (
        ('url', 'vh_url'),
        ('video_descr', 'Comment'),
        ('tag', 'tag'),
        ('tag_comment', 'tag_comment'),
    )
    for target, source in passthrough:
        task[target] = rec[source]
    return task


def _dump_json(obj, path):
    """Write ``obj`` to the file at ``path`` as indented JSON."""
    # The context manager guarantees the handle is flushed and closed even
    # if serialization fails part-way through.
    with open(path, 'w') as out:
        json.dump(obj, out, indent=2, ensure_ascii=False)


def main():
    """Collect rows from YT tables and dump them as JSON files.

    Command-line arguments:
        --source_folder  YT folder searched for tables.
        --count          Maximum total number of rows to collect
                         (default 5000).
        --pool           Optional pool name passed to the cluster.
        --toloka_input   Output path for the Toloka-formatted records
                         (required).
        --raw_input      Output path for the raw records, each extended with
                         a ``table`` field holding its source table
                         (required).

    The YT connection is configured from the ``YT_PROXY`` and ``YT_TOKEN``
    environment variables; a ``KeyError`` is raised if either is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--source_folder',
        default='//home/video-hosting/detailed_tags/toloka_portion'
    )
    parser.add_argument('--count', type=int, default=5000)
    parser.add_argument('--pool', default=None)
    # Both output paths are mandatory: without them the script used to fail
    # in open() only after all tables had already been read.
    parser.add_argument('--toloka_input', required=True)
    parser.add_argument('--raw_input', required=True)
    args = parser.parse_args()

    cluster = clusters.YT(
        proxy=os.environ['YT_PROXY'],
        token=os.environ['YT_TOKEN']
    )
    if args.pool:
        cluster = cluster.update(pool=args.pool)

    yt_client = cluster.driver.client

    count = 0
    recs = []
    for table in yt_client.search(
        root=args.source_folder,
        node_type='table'
    ):
        row_count = yt_client.get_attribute(table, 'row_count', 0)
        # Tables are taken whole; stop at the first one that would push the
        # total past the requested --count budget.
        if count + row_count > args.count:
            break
        recs.extend(
            add_field(x.to_dict(), 'table', table)
            for x in cluster.read(table)
        )
        count += row_count

    toloka_input = [toloka_process(x) for x in recs]

    _dump_json(toloka_input, args.toloka_input)
    _dump_json(recs, args.raw_input)


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
