#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import requests
import time
import copy
import re
import gzip
import StringIO
from collections import defaultdict


# Leading "http://" or "https://" followed by an optional "www." prefix.
re_scheme = re.compile(r'^https?://(www\.)?')


def proc_url(url):
    """Return *url* without its leading http(s) scheme and optional ``www.``.

    URLs that do not start with ``http://`` or ``https://`` are returned
    unchanged. The match is case-sensitive and only ever applies at the very
    beginning of the string.
    """
    prefix = re_scheme.match(url)
    if prefix is None:
        return url
    # The pattern is anchored at the start, so dropping the matched prefix is
    # all that is needed.
    return url[prefix.end():]


def get_serps(id_, headers=None):
    """Download and decode a serpset from the metrics-calculation service.

    The endpoint is queried with fixed parameters (``regional=RU``,
    ``evaluation=VIDEO``, ``absolute=false``,
    ``serpset-filter=onlySearchResult``) and the response body is decoded as
    gzip-compressed JSON.

    Every failed attempt (a raised exception or a non-success HTTP status) is
    counted; up to 4 attempts are made with a 5 second pause between them.

    Args:
        id_: Serpset identifier substituted into the request URL.
        headers: Optional dict of HTTP headers passed to ``requests.get``.

    Returns:
        The decoded JSON document.

    Raises:
        RuntimeError: If no successful response was obtained after all
            attempts.
    """
    # Local import: a binary in-memory buffer available on Python 2 and 3.
    import io

    max_retries = 3
    retry_delay = 5  # seconds between attempts
    url = (
        'https://metrics-calculation.qloud.yandex-team.ru/api/json/'
        '{}?regional=RU&evaluation=VIDEO'
        '&absolute=false&serpset-filter=onlySearchResult'.format(id_)
    )

    response = None
    last_error = None
    for attempt in range(max_retries + 1):
        print('getting serpset {}'.format(id_))
        try:
            # NOTE(review): verify=False disables TLS certificate validation.
            response = requests.get(url, verify=False, headers=headers)
            # Turn 4xx/5xx answers into exceptions so they consume a retry
            # instead of being re-requested forever without any pause.
            response.raise_for_status()
        except Exception as e:  # best-effort retry on any failure
            print('oops: {}'.format(e))
            last_error = e
            if attempt < max_retries:
                time.sleep(retry_delay)
        else:
            break
    else:
        raise RuntimeError(
            'failed to get serpset {} after {} attempts: {}'.format(
                id_, max_retries + 1, last_error
            )
        )

    with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
        return json.load(gz)


def _component_url(component):
    """Return the component's ``componentUrl.pageUrl`` or None when absent."""
    try:
        return component['componentUrl']['pageUrl']
    except (KeyError, TypeError):
        return None


def _collect_top_urls(serpset, limit=10):
    """Return normalized URLs of the first *limit* URL-bearing components.

    The limit applies per serp. Serps without ``components`` and components
    without a page URL are skipped. URLs are normalized with ``proc_url`` and
    de-duplicated, keeping the order of first occurrence.
    """
    seen = set()
    collected = []
    for serp in serpset:
        taken = 0
        for component in serp.get('components') or ():
            if taken >= limit:
                break
            url = _component_url(component)
            if url is None:
                continue
            taken += 1
            normalized = proc_url(url)
            if normalized not in seen:
                seen.add(normalized)
                collected.append(normalized)
    return collected


def _build_report(serpset, good_urls):
    """Build the output document: per-component rows plus the overall metric.

    Each row holds the serp query text, the original component URL and
    ``IsGood`` (1 when the normalized URL is in *good_urls*, else 0). The
    metric is the mean of ``IsGood`` over all rows, or 0.0 when there are no
    rows at all.
    """
    rows = []
    for serp in serpset:
        components = serp.get('components')
        if not components:
            continue
        query = serp['query']['text']
        # NOTE(review): every URL-bearing component is scored here, while only
        # the first 10 per serp are sent to the join -- confirm this is
        # intended.
        for component in components:
            url = _component_url(component)
            if url is None:
                # Same policy as when collecting URLs for the join.
                continue
            rows.append(
                {
                    'Query': query,
                    'Url': url,
                    'IsGood': int(proc_url(url) in good_urls)
                }
            )
    if rows:
        metric = sum(row['IsGood'] for row in rows) / float(len(rows))
    else:
        metric = 0.0
    return {'metric': metric, 'detailed': rows}


def main():
    """Score a serpset by the share of its URLs present in a reference table.

    Steps:
      1. Download the serpset given by ``--serpset_id``.
      2. Write the normalized top URLs of every serp to a YT table and
         inner-join it by ``Url`` with ``<work_dir>/merged``.
      3. Mark every serp component as good when its normalized URL appears in
         the join result, and dump the metric with details to ``--output``
         as JSON.

    The working directory is ``//home/videolog/mma-1514/<ts>`` where ``<ts>``
    is ``--debug_ts`` or the current Unix time; it is removed at the end
    unless ``--debug`` is given.

    Reads ``YQL_TOKEN``, ``METRICS_TOKEN`` and ``YT_TOKEN`` from the
    environment.
    """
    parser = argparse.ArgumentParser()
    # Both of these used to be optional, but omitting either one could only
    # end in a crash or an endless request loop, so fail fast instead.
    parser.add_argument('--output', required=True)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--debug_ts')
    parser.add_argument('--yql_query')
    parser.add_argument('--serpset_id', required=True)
    args = parser.parse_args()

    # --yql_query may be either the query text or a path to a file with it.
    if args.yql_query and os.path.isfile(args.yql_query):
        with codecs.open(args.yql_query, 'r', 'utf8') as f:
            args.yql_query = f.read().strip()

    # Only used by the disabled YQL step below; kept so it can be re-enabled.
    yql_headers = {
        'Content-Type': 'application/json',
        'Authorization': 'OAuth {}'.format(os.environ['YQL_TOKEN'])
    }
    metrics_headers = {
        'Authorization': 'OAuth {}'.format(os.environ['METRICS_TOKEN'])
    }
    if args.debug_ts:
        ts = args.debug_ts
    else:
        # Unix time in seconds; strftime('%s') is a non-portable extension.
        ts = str(int(time.time()))
    work_dir = '//home/videolog/mma-1514/{}'.format(ts)
    urls_merged_table = '{}/merged'.format(work_dir)
    # NOTE: the YQL step that would populate urls_merged_table is disabled.
    # req = requests.post(
    #     'https://yql.yandex.net/api/v2/operations',
    #     json={
    #         'content': args.yql_query.format(urls_merged_table),
    #         'action': 'RUN',
    #         'type': 'SQL'
    #     },
    #     headers=yql_headers
    # )
    # id_ = req.json()['id']
    # status = req.json()['status']
    # tries = 0
    # while status in {'PENDING', 'RUNNING'} and tries < 5:
    #     req = requests.get(
    #         'https://yql.yandex.net/api/v2/operations/{}'.format(id_),
    #         headers=yql_headers
    #     )
    #     status = req.json()['status']
    #     time.sleep(5 * 60)
    #     tries += 1
    # if status != 'COMPLETED':
    #     sys.stderr.write('operation {} failed: {}'.format(id_, req.content))

    hahn = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])

    join_table = '{}/content_for_join'.format(work_dir)
    result_table = '{}/content_joined'.format(work_dir)

    serpset = get_serps(args.serpset_id, headers=metrics_headers)

    # URLs are normalized exactly once, the same way as for the lookup below.
    hahn.write(
        join_table,
        [Record(Url=url) for url in _collect_top_urls(serpset)]
    )

    job = hahn.job()

    job.table(urls_merged_table).join(
        job.table(join_table), type='inner', by='Url'
    ).sort('Url').put(
        result_table
    )

    job.run()

    good_urls = set(rec.Url for rec in hahn.read(result_table))

    output_final = _build_report(serpset, good_urls)

    if not args.debug:
        hahn.driver.client.remove(work_dir, recursive=True)

    with open(args.output, 'w') as out:
        json.dump(output_final, out, indent=2, sort_keys=True)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
