#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import argparse
import codecs
import io
import json
import os
import sys
import tarfile


def _video_component(el):
    """Build the output record for a single video SERP component.

    Raises KeyError when the page URL or either video judgement is missing.
    """
    return {
        'url': el['componentUrl']['pageUrl'],
        'urlsfresh': el['judgements.video_freshness']['name'],
        'relevance': el['judgements.video_relevance']['name'],
        'duration': el.get('text.videoDuration', '0:00'),
        'is_playable': 1,
    }


def _image_component(el):
    """Build the output record for a single image SERP component.

    Returns None when the component carries no usable
    'judgements.image_fresh_relevance' judgement, which tells the caller
    to drop the component.
    """
    new = {
        'url': el['imageadd']['url'],
        'page-url': el['componentUrl']['pageUrl'],
    }
    if 'text.SERVER_DESCR' in el:
        new['source'] = el['text.SERVER_DESCR']

    try:
        new['urlsfresh'] = el['judgements.image_freshness']['name']
    except KeyError:
        # A missing freshness judgement is recorded as not fresh.
        new['urlsfresh'] = 'NOT_FRESH'

    try:
        new['relevance'] = el['judgements.image_fresh_relevance']['name']
    except (KeyError, TypeError):
        # Absent (or null) relevance judgement: skip the component.
        # Only lookup failures are swallowed; anything else propagates.
        return None

    # -10.0 is the "no value" default; only values above -1 are exported.
    visual_quality = el.get('judgements.images_vq3_v2', {}).get('value', -10.0)
    if visual_quality > -1:
        new['visual_quality'] = visual_quality

    dimensions = el.get('dimension.IMAGE_DIMENSION', {})
    if dimensions and 'w' in dimensions and 'h' in dimensions:
        new['width'] = dimensions['w']
        new['height'] = dimensions['h']

    grayscale = el.get('judgements.avatars_avg_gray_deviation', {}).get('name', 'NOT_JUDGED')
    if grayscale != 'NOT_JUDGED':
        new['gray_deviation'] = float(grayscale)

    # The kernel judgement name holds a JSON-encoded object.
    kernel_str = el.get('judgements.proxima_business_kernel', {}).get('name', '{}')
    kernel = json.loads(kernel_str)
    if kernel and 'biz_kernel_quantile' in kernel:
        new['kernel'] = kernel['biz_kernel_quantile']

    return new


def process_serp(obj, type_='video'):
    """Convert the components of one SERP into a list of flat records.

    Args:
        obj: mapping with a 'components' list of component dicts.
        type_: 'video' or 'images'; selects which fields are extracted.
            Any other value yields one empty dict per component.

    Returns:
        A list of dicts, one per kept component. For 'images', components
        without a relevance judgement are dropped.

    Raises:
        KeyError: if a mandatory field of a component is missing.
    """
    result = []
    for el in obj['components']:
        if type_ == 'video':
            new = _video_component(el)
        elif type_ == 'images':
            new = _image_component(el)
            if new is None:
                continue
        else:
            new = {}
        result.append(new)
    return result


def process_serpset(obj, args):
    """Convert every SERP of a serpset into a flat per-query record.

    Each record holds the query text, region id, country and the processed
    components (via process_serp with ``args.type``); the keys of the
    JSON-encoded 'text.additional' field are then merged on top and may
    override the base keys.

    Args:
        obj: iterable of SERP dicts.
        args: object exposing a ``type`` attribute.

    Returns:
        A list with one record dict per SERP, in input order.
    """
    def build_record(serp):
        # Parse the extra payload first so malformed input fails early.
        extra = json.loads(serp['text.additional'])
        query = serp['query']
        record = {}
        record['query'] = query['text']
        record['region_id'] = query['regionId']
        record['serp'] = process_serp(serp, type_=args.type)
        record['country'] = query['country']
        record.update(extra)
        return record

    return [build_record(serp) for serp in obj]


def collect_jsons_from_targz(filename, serpsets_info=None):
    """Unpack a .tar.gz archive and load every member as JSON.

    The archive is extracted into the current working directory and the
    extracted files are left there.

    Args:
        filename: path to the gzip-compressed tar archive.
        serpsets_info: optional path to a JSON file holding a list of
            objects with 'id' and 'cronDownloadId'. When given, members are
            ordered by the 'cronDownloadId' of the entry whose 'id' equals
            the member name up to its first dot.

    Returns:
        A list with the parsed JSON content of each non-directory member,
        in archive order or in the order described above.

    Raises:
        KeyError: if a member name has no matching 'id' in serpsets_info.
    """
    with tarfile.open(filename, 'r:gz') as tf:
        # Directory entries cannot be parsed as JSON, so leave them out.
        filenames = [
            x.name for x in tf.getmembers() if not x.isdir()
        ]
        if serpsets_info:
            with io.open(serpsets_info, encoding='utf-8') as info_file:
                mapping = {
                    str(x['id']): x['cronDownloadId']
                    for x in json.load(info_file)
                }
            filenames = sorted(
                filenames,
                key=lambda x: mapping[x.split('.')[0]]
            )
        # NOTE(review): extractall() trusts member paths; only use this on
        # archives from a trusted source.
        tf.extractall()

    result = []
    for name in filenames:
        # JSON is UTF-8 by specification; do not depend on the locale.
        with io.open(name, encoding='utf-8') as member_file:
            result.append(json.load(member_file))
    return result


def main():
    """Command-line entry point: convert SERP dumps to the flat format.

    Reads the input (one JSON file, or a .tar.gz of JSON files), processes
    every serpset with process_serpset and writes two JSON files: a flat
    list of all records (``output``) and the same records grouped per
    serpset (``output_non_flattened``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('output')
    parser.add_argument('output_non_flattened')
    # Restrict to the formats handled below so a typo fails with a usage
    # error instead of a NameError on an unbound variable.
    parser.add_argument(
        '--input_type', default='json', choices=['json', 'tar.gz']
    )
    # process_serp only extracts fields for these two types.
    parser.add_argument('--type', default='video', choices=['video', 'images'])
    parser.add_argument('--serpsets_info')
    args = parser.parse_args()

    if args.input_type == 'json':
        with io.open(args.input, encoding='utf-8') as source:
            obj = json.load(source)
    else:
        obj = collect_jsons_from_targz(
            args.input, serpsets_info=args.serpsets_info
        )

    # A list of lists means several serpsets; otherwise the input is a
    # single serpset. An empty input is treated as a single empty serpset.
    if obj and isinstance(obj[0], list):
        result = []
        result_non_flattened = []
        for x in obj:
            processed = process_serpset(x, args)
            result.extend(processed)
            result_non_flattened.append(processed)
    else:
        result = process_serpset(obj, args)
        result_non_flattened = [result]

    # Context managers guarantee the outputs are flushed and closed.
    with open(args.output, 'w') as out:
        json.dump(result, out, indent=2, sort_keys=True)
    with open(args.output_non_flattened, 'w') as out:
        json.dump(result_non_flattened, out, indent=2, sort_keys=True)



# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
