import json
from collections import defaultdict
from distutils.util import strtobool
import yt.wrapper as yt


# XML attribute that flags a category group or a category as valuable.
VALUABLE_ATTR_NAME = 'ugc_valuable'
# Synthetic group ids emitted by extract(): every valuable category is listed
# under VALUABLE_GROUP_ID, every other category under NOT_VALUABLE_GROUP_ID.
VALUABLE_GROUP_ID = 'all_valuable'
NOT_VALUABLE_GROUP_ID = 'not_valuable'
# Flag value assumed for a group that does not carry VALUABLE_ATTR_NAME.
VALUABLE_DEFAULT = False

# Column names of the produced rows.
GROUP_COL = 'category_group'
CATEGORY_COL = 'object_category'

# Default table schema used by upload(): both columns are required utf8.
YT_SCHEMA = [
    dict(name=GROUP_COL, type='utf8', required=True),
    dict(name=CATEGORY_COL, type='utf8', required=True),
]


def extract(category_groups_xml):
    """Yield group/category rows described by a category-groups XML tree.

    Every ``/category-groups/category-group`` element and each of its
    ``categories/category`` children is read. A category inherits the
    ``ugc_valuable`` flag of its group unless it sets the attribute itself;
    a group without the attribute falls back to ``VALUABLE_DEFAULT``.

    A valuable category produces two rows: one under ``VALUABLE_GROUP_ID``
    and one under the id of its own group. Any other category produces a
    single row under ``NOT_VALUABLE_GROUP_ID``.

    This is a generator, so validation errors surface while iterating.

    NOTE(review): ``strtobool`` comes from ``distutils``, which is removed in
    Python 3.12 (PEP 632); the import will need a replacement before
    upgrading.

    Args:
        category_groups_xml: Parsed XML root exposing ``xpath()`` whose
            elements provide ``tag`` and ``get()`` (presumably an lxml tree;
            confirm against the caller).

    Yields:
        dict: ``{GROUP_COL: <group id>, CATEGORY_COL: <category id>}``.

    Raises:
        RuntimeError: If an element has no ``id`` attribute, or if an id is
            repeated among elements with the same tag anywhere in the tree.
        ValueError: Raised by ``strtobool`` when the valuable attribute holds
            a value it does not recognise as a boolean.
    """
    # Ids already seen, keyed by element tag: uniqueness is enforced per tag
    # across the whole document, not per parent group.
    seen_ids = defaultdict(set)

    def read_entity(elem, valuable_default):
        """Validate the id of *elem* and return ``(id, valuable)``."""
        entity_id = elem.get('id')
        if entity_id is None:
            # Fail early with a clear message instead of emitting a null id
            # into a required column or reporting a bogus "None" duplicate.
            raise RuntimeError('Missing {} id'.format(elem.tag))
        if entity_id in seen_ids[elem.tag]:
            raise RuntimeError('Duplicate {} id "{}"'.format(elem.tag, entity_id))
        seen_ids[elem.tag].add(entity_id)

        # strtobool() only accepts strings, so the inherited default (a bool
        # or a previously parsed 0/1) is stringified before parsing.
        raw_valuable = elem.get(VALUABLE_ATTR_NAME, valuable_default)
        return entity_id, strtobool(str(raw_valuable))

    for group in category_groups_xml.xpath('/category-groups/category-group'):
        group_id, group_valuable = read_entity(group, VALUABLE_DEFAULT)
        for category in group.xpath('categories/category'):
            category_id, category_valuable = read_entity(category, group_valuable)
            if category_valuable:
                ugc_groups = (VALUABLE_GROUP_ID, group_id)
            else:
                ugc_groups = (NOT_VALUABLE_GROUP_ID,)
            for ugc_group in ugc_groups:
                yield {GROUP_COL: ugc_group, CATEGORY_COL: category_id}


def upload(json_rows, yt_client, yt_path, yt_schema=YT_SCHEMA):
    """Write *json_rows* to the table at *yt_path* with the given schema.

    Args:
        json_rows: Iterable of row dicts handed to ``write_table`` as is.
        yt_client: Client object whose ``write_table`` method performs the
            write.
        yt_path: Destination table path.
        yt_schema: Schema attached to the destination path as its ``schema``
            attribute; defaults to ``YT_SCHEMA``.
    """
    schema_attributes = {'schema': yt_schema}
    destination = yt.TablePath(yt_path, attributes=schema_attributes)
    # Rows are sent as JSON with encode_utf8 switched off.
    yt_client.write_table(destination, json_rows, format='<encode_utf8=%false>json')

def print_json(json_rows):
    """Print every row of *json_rows* as one JSON document per line.

    Rows are serialised lazily, one at a time, so any iterable (including a
    generator) can be passed.
    """
    for serialized_row in map(json.dumps, json_rows):
        print(serialized_row)
