# coding=utf-8
import travel.avia.admin.init_project  # noqa

import logging
import os
import time
from StringIO import StringIO
from argparse import ArgumentParser
from ciso8601 import parse_datetime
from datetime import datetime, timedelta
from itertools import izip

import requests
from django.conf import settings
from lxml import etree
from yt.wrapper import TablePath, JsonFormat

from travel.avia.admin.lib import scraper
from travel.avia.admin.lib.logs import create_current_file_run_log, add_stdout_handler
from travel.avia.admin.lib.scraper import log as scraper_log
from travel.avia.admin.lib.yt_helpers import yt_client_fabric, safe_tables_for_daterange

# Endpoint of the avia wizard service requested by get_wizard_service_answer().
WIZARD_URL = 'http://wizard.avia.yandex.net/api/wizard/avia/'
# YT path searched by load_most_popular_queries() for tables of popular flights.
YT_FLIGHTS_PATH = '//home/avia/stats/most_popular_flights'
# YT path under which save_stats() writes one table named after TODAY.
YT_STATS_PATH = '//home/avia/stats/compare_flight_wizards'
# Scraper token taken from the environment; None when the variable is not set.
SCRAPER_TOKEN = os.getenv('AVIA_SCRAPER_TOKEN')
# Passed to the scraper as the ``invoker`` meta field.
ROBOT_NAME = 'robot-avia'
# Passed to the scraper via SetQuota().
QUOTA_ID = 'avia-common'
# Upper bound on the number of popular flights that are compared per run.
FLIGHTS_NUMBER = 3000
# Number of queries processed per batch in main().
BATCH_SIZE = 100
# main() returns early unless settings.ENVIRONMENT is one of these.
ALLOWED_ENVS = ['production', 'dev']
# Passed to the scraper via SetRegion().
QUERIES_REGION = 213  # MOSCOW

log = logging.getLogger(__name__)
# NOTE(review): runs at import time; presumably attaches a per-script log
# handler - confirm against travel.avia.admin.lib.logs.
create_current_file_run_log()
# Current UTC date, fixed once at import time and reused for the whole run.
TODAY = datetime.utcnow().date()


def obtain_most_popular_flights(table):
    """Read ``(flight_number, cnt)`` pairs from a YT table.

    :param table: table to read; handed unchanged to ``read_table``.
    :return: list of ``(flight_number, cnt)`` tuples in row order.
    """
    client = yt_client_fabric.create()
    flights = []
    for row in client.read_table(table, format=JsonFormat()):
        flights.append((row['flight_number'], row['cnt']))
    return flights


def load_most_popular_queries():
    """Load at most FLIGHTS_NUMBER popular flights from the last 3 days.

    Looks up tables between ``TODAY - 3 days`` and ``TODAY`` and reads the
    last one of the returned list (presumably the most recent - the
    ordering is defined by ``safe_tables_for_daterange``).

    :return: list of ``(flight_number, cnt)`` tuples.
    :raises Exception: when no table is found for the date range.
    """
    client = yt_client_fabric.create()
    flights_path = client.TablePath(YT_FLIGHTS_PATH)
    first_day = TODAY - timedelta(days=3)
    recent_tables = safe_tables_for_daterange(client, flights_path, first_day, TODAY)
    if recent_tables:
        flights = obtain_most_popular_flights(recent_tables[-1])
        return flights[:FLIGHTS_NUMBER]
    raise Exception('Could not find most popular flight numbers table for the last 3 days.')


def save_stats(stats):
    """Write ``stats`` as a single row to the YT table named after TODAY.

    :param stats: mapping with the collected counters.
    """
    client = yt_client_fabric.create()
    stats_root = client.TablePath(YT_STATS_PATH)
    table_name = TODAY.strftime('%Y-%m-%d')
    destination = TablePath.join(stats_root, table_name)
    client.write_table(destination, [stats])


def flights_from_queries(queries):
    """Return only the flight numbers from ``(flight, cnt)`` pairs.

    :param queries: iterable of two-element ``(flight, cnt)`` pairs.
    :return: list of flight numbers in the original order.
    """
    flights = []
    for flight, _cnt in queries:
        flights.append(flight)
    return flights


def get_html_tree(html):
    """Parse an HTML string into an lxml element tree.

    :param html: HTML markup as a string.
    :return: the tree produced by ``etree.parse`` with an ``HTMLParser``.
    """
    source = StringIO(html)
    html_parser = etree.HTMLParser()
    return etree.parse(source, html_parser)


def value_containing_alphanum_or_none(string):
    """Return ``string`` if it has an alphanumeric character, else ``None``.

    Used to turn placeholders such as ``'?'`` into ``None``.
    """
    for char in string:
        if char.isalnum():
            return string
    return None


def parse_google_wizard(tree):
    """Extract flight infos from the flight wizard markup of a google snippet.

    Every ``div`` carrying all expected ``data-*`` attributes is treated as
    one flight. Scheduled times are derived by subtracting the reported
    delay (in minutes) from the reported departure/arrival time.

    Entries whose ``data-terminals`` value does not split into exactly four
    parts are logged and skipped. Previously the error was logged and the
    code went on to index the parts anyway, which either raised IndexError
    or assigned values to the wrong fields.

    :param tree: parsed HTML tree supporting ``findall``.
    :return: list of flight info dicts, or ``None`` when no usable wizard
        entry is found.
    """
    expected_attributes = (
        'data-terminals',
        'data-departure_time',
        'data-arrival_time',
        'data-departure_delay',
        'data-arrival_delay',
        'data-is_schedule_only',
    )
    wizard_attrs = [
        div.attrib for div in tree.findall('.//div')
        if all(name in div.attrib for name in expected_attributes)
    ]
    if not wizard_attrs:
        return None

    flight_infos = []
    for google_attrs in wizard_attrs:
        # Format is 'D/16 1/?': departure terminal/gate, arrival terminal/gate.
        data_terminals = google_attrs['data-terminals'].replace('/', ' ').split()
        if len(data_terminals) != 4:
            log.error('Expected exactly 4 parts of data-terminals. '
                      'Data terminals: %s', google_attrs['data-terminals'])
            continue
        departure_terminal, departure_gate, arrival_terminal, arrival_gate = data_terminals

        departure_delay = int(google_attrs['data-departure_delay'])
        arrival_delay = int(google_attrs['data-arrival_delay'])
        departure_actual_time = parse_datetime(google_attrs['data-departure_time'])
        arrival_actual_time = parse_datetime(google_attrs['data-arrival_time'])
        schedule_only = google_attrs['data-is_schedule_only'] == 'y'
        flight_infos.append({
            'departure_gate': value_containing_alphanum_or_none(departure_gate),
            'departure_time_scheduled': departure_actual_time - timedelta(minutes=departure_delay),
            'arrival_time_scheduled': arrival_actual_time - timedelta(minutes=arrival_delay),
            'arrival_delay': arrival_delay,
            'departure_delay': departure_delay,
            'arrival_gate': value_containing_alphanum_or_none(arrival_gate),
            'arrival_terminal': value_containing_alphanum_or_none(arrival_terminal),
            'departure_terminal': value_containing_alphanum_or_none(departure_terminal),
            'status': 'NO_INFO' if schedule_only else 'SOME_INFO',
        })
    # When every entry was malformed, report it the same way as "no wizard".
    return flight_infos or None


def get_scraper_configured_for(search_engine, flags=None):
    """Build a scraper configured for the given search engine.

    :param search_engine: ``'google'`` or ``'yandex'``; any other value
        raises ``KeyError``.
    :param flags: optional mapping of CGI parameters added to the requests.
    :return: the object returned by ``BuildScraper()``.
    """
    configuration = scraper.Configure()
    configuration = configuration.SetRegion(QUERIES_REGION)
    configuration = configuration.SetQuota(QUOTA_ID)
    configuration = configuration.SetMeta(
        token=SCRAPER_TOKEN,
        creator='ismirn0ff',
        invoker=ROBOT_NAME,
        name='compare flight wizards',
    )
    for key, value in (flags or {}).items():
        configuration = configuration.AddCgiParameter(key, value)

    # Name of the configuration method that selects the search engine.
    engine_selectors = {
        'google': 'ForGoogle',
        'yandex': 'ForYandex',
    }
    select_engine = getattr(configuration, engine_selectors[search_engine])
    return select_engine().BuildScraper()


def get_serp_lines(serp):
    """Return the parsed SERP components, or ``None`` if they are missing.

    Any error while walking ``serp['serp-page']['parser-results']
    ['components']`` is swallowed and reported as ``None``.
    """
    try:
        page = serp['serp-page']
        parser_results = page['parser-results']
        return parser_results['components']
    except Exception:
        return None


def parse_serps(serps, expected_length):
    """Normalise scraper responses and check there is one per query.

    Responses with status ``'fail'`` are dropped. The rest are reduced to
    their request id and result lines and sorted by numeric id.

    :param serps: iterable of raw scraper responses.
    :param expected_length: number of queries that were sent.
    :return: list of ``{'id': ..., 'result_lines': ...}`` dicts sorted by id.
    :raises Exception: when the ids are not exactly ``1..expected_length``.
    """
    parsed = [
        {
            'id': serp['serp-request-explained']['serp-request-id'],
            'result_lines': get_serp_lines(serp),
        }
        for serp in serps if serp['status']['status'] != 'fail'
    ]
    parsed.sort(key=lambda item: int(item['id']))
    indexes = [int(item['id']) for item in parsed]
    # Compare real lists so the check does not rely on map()/range()
    # returning lists.
    if indexes != list(range(1, expected_length + 1)):
        # The message is formatted here; passing the value as a second
        # Exception argument left the '%s' placeholder unfilled.
        raise Exception(
            'Scraper should return exactly one response per query. '
            'Returned responses for queries %s' % ', '.join(str(index) for index in indexes)
        )
    return parsed


def get_ticket_result(search_engine, ticket, expected_result_length):
    """Fetch and validate the scraper result for a batch ticket.

    :param search_engine: engine the batch was sent to.
    :param ticket: ticket returned when the batch was sent.
    :param expected_result_length: number of queries in the batch.
    :return: parsed responses as returned by ``parse_serps``.
    """
    proxy = get_scraper_configured_for(search_engine).proxy
    raw_serps = proxy.GetResult(ticket)
    return parse_serps(raw_serps, expected_result_length)


def await_ticket(search_engine, ticket):
    """Block until the scraper reports the batch ticket as completed.

    Polls every 30 seconds; there is no upper bound on the waiting time.
    """
    configured = get_scraper_configured_for(search_engine)
    while True:
        if configured.proxy.IsCompleted(ticket):
            return
        log.info('waiting for batch %s download...', ticket)
        time.sleep(30)


def send_request(search_engine, queries, flags=None):
    """Send the flight numbers of ``queries`` to the scraper as one batch.

    :param search_engine: engine to query.
    :param queries: list of ``(flight, cnt)`` pairs.
    :param flags: optional CGI parameters for the requests.
    :return: ticket identifying the submitted batch.
    """
    batch_scraper = get_scraper_configured_for(search_engine, flags)
    batch_scraper.queries = flights_from_queries(queries)
    batch_request = batch_scraper.CreateBatchRequest()
    batch_ticket = batch_scraper.proxy.SendBatch(batch_request)
    log.info('Sent batch request. Ticket %s', batch_ticket)
    return batch_ticket


def get_responses_batches(queries):
    """Run ``queries`` through google and yandex and collect the responses.

    Both batches are submitted before waiting, then awaited and fetched in
    the order google, yandex.

    :return: dict with keys ``'google'`` and ``'yandex'`` holding the parsed
        responses for each engine.
    """
    tickets = [
        ('google', send_request('google', queries)),
        ('yandex', send_request('yandex', queries, flags={'exp_flags': 'avia=dev=1'})),
    ]
    for engine, ticket in tickets:
        await_ticket(engine, ticket)
    return {
        engine: get_ticket_result(engine, ticket, len(queries))
        for engine, ticket in tickets
    }


def get_wizard_service_answer(flight_number, timeout=30):
    """Request the avia wizard service for a flight and return its JSON.

    :param flight_number: flight number to ask about.
    :param timeout: seconds to wait for the service before giving up.
        Without a timeout ``requests`` may wait indefinitely on a stalled
        connection and block the whole run.
    :return: decoded JSON body of the response.
    :raises requests.exceptions.RequestException: on connection problems or
        when the timeout expires.
    """
    response = requests.get(WIZARD_URL, params={
        'content_flags': 'dev=1',
        'device': 'desktop',
        'geo_id': '213',
        'lang': 'ru',
        'main_reqid': 'dev',
        'reqid': 'dev',
        'tld': 'ru',
        'user_request': 'dev',
        'flight_number': flight_number,
    }, timeout=timeout)
    return response.json()


def is_same(google_flight_info, yandex_flight_info):
    """Tell whether both infos have the same scheduled departure time.

    Yandex is tz-unaware, google is tz-aware, but both return time in the
    departure airport timezone, so the google tzinfo is simply dropped
    before comparing.
    """
    key = 'departure_time_scheduled'
    google_departure = google_flight_info[key]
    yandex_departure = yandex_flight_info[key]
    naive_google_departure = google_departure.replace(tzinfo=None)
    return naive_google_departure == yandex_departure


def format_departure_time(flight_info):
    """Format the scheduled departure time as ``YYYY-MM-DD HH:MM:SS``."""
    scheduled_departure = flight_info['departure_time_scheduled']
    return scheduled_departure.strftime('%Y-%m-%d %H:%M:%S')


def _select_google_flight_info(query_result, yandex_flight_info, flight_number):
    """Pick the google flight info comparable with the yandex one.

    :return: ``None`` when there is no serp or no wizard, ``'Exists'`` when
        a wizard exists but cannot be matched with the yandex flight, or
        the matching flight info dict.
    """
    lines = query_result['result_lines']
    if not lines:
        log.info('Scraper could not get google serp for flight number %s', flight_number)
        return None

    parsed = [parse_google_wizard(get_html_tree(line['snippet'])) for line in lines]
    google_wizards = [wizard for wizard in parsed if wizard]
    if not google_wizards:
        log.info('No google wizard found for flight %s', flight_number)
        return None
    if len(google_wizards) != 1:
        log.error('Expected 1 flight wizard in google response for flight %s. Found %d.',
                  flight_number, len(google_wizards))
        return 'Exists'

    flight_infos = google_wizards[0]
    if not yandex_flight_info:
        return 'Exists'

    matching = [info for info in flight_infos if is_same(info, yandex_flight_info)]
    if len(matching) == 1:
        return matching[0]

    google_departure_times = [format_departure_time(info) for info in flight_infos]
    yandex_departure_time = format_departure_time(yandex_flight_info)
    log.error('There should be exactly 1 flight_info in google wizard, '
              'same as yandex default flight info for flight %s. Found %d.\n'
              'Google flight departure times: %s\n'
              'Yandex flight departure time: %s', flight_number, len(matching),
              google_departure_times, yandex_departure_time)
    return 'Exists'


def get_google_flight_infos(responses_batch, yandex_flight_infos, queries):
    """Build one google flight info entry per query.

    Each entry is ``None``, the marker ``'Exists'`` or a flight info dict
    (see ``_select_google_flight_info``). Any exception raised while
    handling a query is logged and recorded as ``None``.

    :param responses_batch: parsed google responses, one per query.
    :param yandex_flight_infos: yandex flight infos, one per query.
    :param queries: list of ``(flight_number, cnt)`` pairs.
    :return: list aligned with ``queries``.
    """
    google_flight_infos = []
    for query_result, yandex_flight_info, (flight_number, cnt) in izip(responses_batch, yandex_flight_infos, queries):
        try:
            selected = _select_google_flight_info(query_result, yandex_flight_info, flight_number)
        except Exception:
            log.exception('Could not parse google flight wizard for flight %s', flight_number)
            selected = None
        google_flight_infos.append(selected)
    return google_flight_infos


def parse_yandex_direction(flight, direction):
    """Extract gate, terminal, scheduled time and delay for one direction.

    :param flight: flight dict from the wizard service answer.
    :param direction: key of the direction, e.g. ``'departure'`` or
        ``'arrival'``; also used as the prefix of the resulting keys.
    :return: dict with ``<direction>_gate``, ``<direction>_terminal``,
        ``<direction>_time_scheduled`` and ``<direction>_delay`` (whole
        minutes between the actual and the scheduled time).
    """
    content = flight[direction]
    scheduled = parse_datetime(content['scheduled_time'])
    actual = parse_datetime(content['time'])

    prefix = direction + '_'
    parsed = {}
    parsed[prefix + 'gate'] = content['gate']
    parsed[prefix + 'terminal'] = content['terminal']
    parsed[prefix + 'time_scheduled'] = scheduled
    delay_seconds = int((actual - scheduled).total_seconds())
    parsed[prefix + 'delay'] = delay_seconds // 60
    return parsed


def _load_yandex_flight_info(flight_number):
    """Ask the wizard service about a flight and build its flight info.

    :return: flight info dict, or ``None`` when the service answers with
        an ``'error'`` field.
    :raises Exception: when the answer does not have exactly one default
        tab or lacks expected fields.
    """
    answer = get_wizard_service_answer(flight_number)
    if 'error' in answer:
        log.info('Error returned by wizard service for flight %s: %s',
                 flight_number, answer['error'])
        return None

    default_tabs = [tab for tab in answer['content']['tabs'] if tab['default']]
    if len(default_tabs) != 1:
        raise Exception('There should be exactly 1 default tab. Found %s' % len(default_tabs))

    flight = default_tabs[0]['flights'][0]
    flight_info = parse_yandex_direction(flight, 'arrival')
    flight_info.update(parse_yandex_direction(flight, 'departure'))
    status_code = flight['status']['code']
    flight_info['status'] = 'NO_INFO' if status_code == 'unknown' else status_code
    return flight_info


def get_yandex_flight_infos(queries, responses_batch):
    """Build one yandex flight info entry per query.

    An entry is ``None`` when the serp is missing, when it has no line of
    type ``WIZARD_TICKET``, or when the wizard service answer cannot be
    used; otherwise it is a flight info dict.

    :param queries: list of ``(flight_number, cnt)`` pairs.
    :param responses_batch: parsed yandex responses, one per query.
    :return: list aligned with ``queries``.
    """
    flight_infos = []
    for (flight_number, flights_cnt), query_result in izip(queries, responses_batch):
        if not query_result['result_lines']:
            log.info('Scraper could not get yandex serp for flight number %s', flight_number)
            flight_infos.append(None)
            continue

        has_wizard = any(line['wizard-type'] == 'WIZARD_TICKET'
                         for line in query_result['result_lines'])
        if not has_wizard:
            log.info('No yandex wizard found for %s', flight_number)
            flight_infos.append(None)
            continue

        try:
            flight_info = _load_yandex_flight_info(flight_number)
        except Exception:
            flight_info = None
            log.exception('Could not parse data from yandex wizard source for %s.', flight_number)
        flight_infos.append(flight_info)
    return flight_infos


def get_flight_infos(queries):
    """Collect yandex and google flight infos for the given queries.

    Yandex infos are built first because the google ones are matched
    against them.

    :return: dict with keys ``'yandex'`` and ``'google'``, each a list
        aligned with ``queries``.
    """
    batches = get_responses_batches(queries)
    yandex_infos = get_yandex_flight_infos(queries, batches['yandex'])
    google_infos = get_google_flight_infos(batches['google'], yandex_infos, queries)
    return {
        'yandex': yandex_infos,
        'google': google_infos,
    }


def leave_only_answered_queries(flight_infos, queries):
    """Keep only the queries every search engine has a usable answer for.

    An answer is usable when it is neither ``None`` nor the ``'Exists'``
    marker. Works for any number of engines and does not depend on
    ``keys()``/``zip``/``filter`` returning lists.

    :param flight_infos: mapping engine name -> list of answers aligned
        with ``queries``.
    :param queries: list of ``(flight, cnt)`` pairs.
    :return: two-element list ``[filtered_flight_infos, filtered_queries]``
        where ``filtered_flight_infos`` has the same keys as
        ``flight_infos``.
    """
    engines = list(flight_infos)
    rows = zip(queries, *[flight_infos[engine] for engine in engines])
    # Each row is (query, answer_1, ..., answer_n). The query tuple itself
    # never equals one of the markers, so the whole row can be checked.
    only_answered = [
        row for row in rows
        if all(item not in (None, 'Exists') for item in row)
    ]
    return [
        {
            engine: [row[position] for row in only_answered]
            for position, engine in enumerate(engines, 1)
        },
        [row[0] for row in only_answered],
    ]


def create_empty_stats():
    """Return a stats dict with every ``<name>_<engine>`` counter at zero.

    Names are status, departure, arrival, terminal, gate and wizard;
    engines are yandex, google and total.
    """
    stats = {}
    for name in ('status', 'departure', 'arrival', 'terminal', 'gate', 'wizard'):
        for search_engine in ('yandex', 'google', 'total'):
            stats['{}_{}'.format(name, search_engine)] = 0
    return stats


def get_stats(queries):
    """Compute comparison stats for one batch of queries.

    ``wizard_*`` counters are weighted by the query counts of all queries.
    The remaining counters only consider queries answered by every engine.

    Tuple-parameter lambdas (``lambda (query, cnt): cnt``) were replaced
    by generator expressions; that syntax was removed by PEP 3113.

    :param queries: list of ``(flight, cnt)`` pairs.
    :return: dict of counters with the keys of ``create_empty_stats``.
    """
    stats = create_empty_stats()
    stats['wizard_total'] += sum(cnt for _query, cnt in queries)

    all_flight_infos = get_flight_infos(queries)
    for search_engine, infos in all_flight_infos.items():
        for flight_info, (_query, queries_cnt) in zip(infos, queries):
            # Any truthy entry (including the 'Exists' marker) counts as
            # "the engine showed a wizard".
            stats['wizard_%s' % search_engine] += queries_cnt if flight_info else 0

    flight_infos, answered_queries = leave_only_answered_queries(all_flight_infos, queries)
    for search_engine, infos in flight_infos.items():
        for flight_info, (_query, queries_cnt) in zip(infos, answered_queries):
            update_stats_with_flight(stats, flight_info, search_engine, queries_cnt)

    queries_total = sum(cnt for _query, cnt in answered_queries)
    for name in ('status', 'departure', 'arrival', 'terminal', 'gate'):
        stats['{}_total'.format(name)] += queries_total
    stats['terminal_total'] += queries_total  # because there are two terminals - departure and arrival
    return stats


def update_stats_with_flight(stats, flight_info, search_engine, queries_cnt):
    """Add ``queries_cnt`` to each engine counter the flight info covers.

    Counters are increased in place:

    * ``departure_*`` / ``arrival_*`` when the corresponding delay is not 0;
    * ``terminal_*`` once per direction that has a terminal;
    * ``gate_*`` when a departure gate is known (arrival gate is ignored);
    * ``status_*`` when the status is anything but ``'NO_INFO'``.
    """
    suffix = '_' + search_engine

    def bump(name):
        stats[name + suffix] += queries_cnt

    for direction in ('departure', 'arrival'):
        if flight_info['%s_delay' % direction] != 0:
            bump(direction)
        if flight_info['%s_terminal' % direction]:
            bump('terminal')
    if flight_info['departure_gate']:
        bump('gate')
    if flight_info['status'] != 'NO_INFO':
        bump('status')


def split_on_batches(queries, batch_size):
    """Split ``queries`` into consecutive slices of at most ``batch_size``.

    :return: list of slices; the last one may be shorter.
    """
    batches = []
    for start in range(0, len(queries), batch_size):
        stop = start + batch_size
        batches.append(queries[start:stop])
    return batches


def main():
    """Compare flight wizards for the most popular flights and save stats.

    Does nothing outside of ALLOWED_ENVS. Queries are processed in batches
    of BATCH_SIZE. A batch that fails, or that takes longer than an hour,
    is logged and left out of the totals. Any other failure is logged and
    re-raised.
    """
    argparser = ArgumentParser()
    argparser.add_argument('-v', '--verbose', action='store_true')
    args = argparser.parse_args()
    if args.verbose:
        add_stdout_handler(log)
        add_stdout_handler(scraper_log)
    if settings.ENVIRONMENT not in ALLOWED_ENVS:
        # logging uses %-style placeholders; '{}' with an extra argument
        # made the record fail to format.
        log.info('Environment %s is not allowed', settings.ENVIRONMENT)
        return
    try:
        most_popular_queries = load_most_popular_queries()
        stats = create_empty_stats()
        for index, queries_batch in enumerate(split_on_batches(most_popular_queries, BATCH_SIZE)):
            try:
                time_of_start = datetime.now()
                batch_stats = get_stats(queries_batch)
                if datetime.now() - time_of_start > timedelta(hours=1):
                    raise Exception('Batch stats were calculating for too long. '
                                    'Not reliable anymore.')
            except Exception:
                log.exception('Could not prepare stats for batch %s', index)
            else:
                for key, value in batch_stats.items():
                    stats[key] += value
        save_stats(stats)
    except Exception:
        log.exception('Could not prepare stats.')
        raise
