import datetime
import sys
import csv

from yt.wrapper.client import Yt
from webmaster3.cass import Cass, Keyspaces
from webmaster3.task import Task

def run(cluster):
    """Export canceled host-display-name moderation requests to a CSV file.

    The ``host_display_name_moderation`` table is scanned twice. The first
    pass gathers canceled requests via ``collect_canceled``; the second pass
    lets ``find_initial_request_for_canceled`` fill in, for each of them, the
    date of the request that preceded the cancellation. The outcome is
    written to ``host_display_name_canceled.csv`` in the current user's home
    directory, one ``(host_id, hours)`` row per canceled request.

    :param cluster: value handed to ``Cass.cluster`` to pick the Cassandra
        cluster to connect to.
    """
    from os.path import expanduser, join

    # NOTE(review): `client` is never referenced again in this function. It
    # is kept only in case constructing it has side effects -- confirm and
    # remove.
    client = Yt(proxy='banach.yt.yandex.net', token=None)
    session = Cass.cluster(cluster).connect()
    session.set_keyspace(Keyspaces.WEBMASTER3)
    query = "SELECT * FROM host_display_name_moderation"

    canceled_requests = {}
    collect_canceled(session.execute(query), canceled_requests)
    print("Total canceled requests: %d" % len(canceled_requests))

    # The query is issued a second time for the second pass (presumably
    # because the first result set has already been consumed -- confirm).
    find_initial_request_for_canceled(session.execute(query), canceled_requests)

    csv_path = join(expanduser("~"), 'host_display_name_canceled.csv')

    # The csv module writes its own line terminators, so the file must not
    # apply newline translation: binary mode on Python 2, newline='' on
    # Python 3.
    if sys.version_info[0] < 3:
        csv_file = open(csv_path, "wb")
    else:
        csv_file = open(csv_path, "w", newline='')

    with csv_file as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writerows(rows_generator(canceled_requests))

    print("Done.")


def collect_canceled(rows, out):
    """Collect rows with state 5 modified in 2017 into ``out``.

    Each matching row is copied into a plain dict (via ``vars(row)``) and
    stored under the row's ``host_id``; a later matching row for the same
    host replaces an earlier one. Every stored dict also gets a
    ``created_date`` entry initialised to ``datetime.fromtimestamp(0)``.

    :param rows: iterable of row objects exposing ``state``,
        ``modification_date`` and ``host_id`` attributes.
    :param out: dict that is filled in place, keyed by ``host_id``.
    """
    for seen, row in enumerate(rows, 1):
        # Progress report every 10000 rows.
        if seen % 10000 == 0:
            print("Rows: " + str(seen))

        if int(row.state) != 5 or row.modification_date.year != 2017:
            continue

        record = dict(vars(row))
        # Placeholder start date; see find_initial_request_for_canceled,
        # which only ever moves it forward.
        record['created_date'] = datetime.datetime.fromtimestamp(0)
        out[row.host_id] = record


def find_initial_request_for_canceled(rows, canceled_requests):
    """Move each canceled request's ``created_date`` to its preceding request.

    For every row with state 1 whose ``host_id`` is a key of
    ``canceled_requests``, the row's ``modification_date`` becomes the
    entry's new ``created_date`` when it lies strictly between the current
    ``created_date`` and the entry's ``modification_date``. After the full
    scan each entry therefore holds the latest state-1 date that is still
    earlier than the cancellation.

    :param rows: iterable of row objects exposing ``state``, ``host_id`` and
        ``modification_date`` attributes.
    :param canceled_requests: dict keyed by ``host_id`` whose values are
        dicts with ``modification_date`` and ``created_date`` entries;
        updated in place.
    """
    for seen, row in enumerate(rows, 1):
        # Progress report every 10000 rows.
        if seen % 10000 == 0:
            print("Rows: " + str(seen))

        if int(row.state) != 1:
            continue
        if row.host_id not in canceled_requests:
            continue

        entry = canceled_requests[row.host_id]
        requested_at = row.modification_date
        # Strict on both sides: must precede the cancellation and be newer
        # than the best candidate found so far.
        if entry['modification_date'] > requested_at > entry['created_date']:
            entry['created_date'] = requested_at


def rows_generator(canceled_requests):
    """Yield one ``(host_id, hours)`` tuple per canceled request.

    ``hours`` is the time between the entry's ``created_date`` and its
    ``modification_date``, expressed in hours and truncated toward zero by
    ``int``.

    :param canceled_requests: dict whose values are dicts with ``host_id``,
        ``created_date`` and ``modification_date`` entries.
    """
    for key in canceled_requests:
        entry = canceled_requests[key]
        host = entry['host_id']
        started = entry['created_date']
        finished = entry['modification_date']
        elapsed = finished - started
        yield host, int(elapsed.total_seconds() / 3600)


# Script entry point, executed as soon as this module is loaded: hands `run`
# and the string "prod" to Task.run (presumably "prod" is forwarded to `run`
# as its `cluster` argument -- confirm against webmaster3.task).
Task.run(run, "prod")
