import yt.wrapper as yt
from datetime import datetime
import os
import random
from urllib2 import urlparse
import sys
import url_ndp
import math

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    clusters,
    Record,
    files
)

class PreMarkupProcess(object):
    """Build the tables of a new markup state from logs and old assessments.

    All tables of a state are stored under ``<root_dir>/<new_state>``; the
    ``*_filename`` class attributes below are the table names used there.
    """

    # Table names inside a state directory.
    assessments_filename = '01_assessments'
    freshness_filename = '02_freshness'
    url_samples_filename = '03_url_samples'
    owner_priorities_filename = '04_owner_priorities'
    owner_priorities_filtered_filename = '05_owner_priorities_filtered'
    high_priority_data_filename = '06_high_priority_data'
    high_priority_data_joined_filename = '07_high_priority_data_joined'
    data_for_markup_filename = '08_data_for_markup'
    merged_assessments_filename = '12_merged_assessments'

    def __init__(self, root_dir, logs_dir, log_days_range, prev_state, new_state, markup_quota, token):
        """Remember the run parameters and create the nile cluster handle.

        root_dir       -- directory that contains the state directories
        logs_dir       -- directory with the log tables
        log_days_range -- path component appended to ``logs_dir`` to select logs
        prev_state     -- name of the state whose merged assessments are reused
        new_state      -- name of the state to create
        markup_quota   -- upper bound on the summed ``missing_marks`` selected
        token          -- token passed to both nile and yt
        """
        self.token = token
        self.root_dir = root_dir
        self.logs_dir = logs_dir
        self.log_days_range = log_days_range
        self.markup_quota = markup_quota

        self.prev_state = prev_state
        self.state = new_state
        self.state_path = os.path.join(self.root_dir, self.state)

        # Tuning constants of the sampling pipeline.
        self.n_urls_to_sample = 100
        self.freshness_threshold = 10

        hahn = clusters.Hahn(token=self.token)
        self.cluster = hahn.env(templates=dict(job_root=self.root_dir))

        # NOTE: the guard against reusing an existing state directory is
        # currently disabled:
        # if yt.exists(self.state_path):
        #     raise Exception("Current state path already exists")

    def _state_table(self, table_name):
        """Return the path of ``table_name`` inside the current state."""
        return os.path.join(self.state_path, table_name)

    def create_state_from_assessments(self):
        """Create the state directory and copy the previous merged assessments into it."""
        yt.update_config({"proxy": {"url": "hahn.yt.yandex.net"}, "token": self.token})
        yt.mkdir(self.state_path)

        previous_assessments = os.path.join(
            self.root_dir, self.prev_state, self.merged_assessments_filename)
        yt.copy(previous_assessments, self._state_table(self.assessments_filename))

    def calc_freshness(self):
        """Reduce the assessments table by owner into the freshness table."""
        job = self.cluster.job()

        assessments = job.table(self._state_table(self.assessments_filename))
        per_owner = assessments.groupby('owner')
        owner_freshness = per_owner.reduce(reduce_owner_freshness)
        owner_freshness.put(self._state_table(self.freshness_filename))

        job.run()

    def sample_data_for_markup(self):
        """Run the sampling pipeline and return the path of the resulting table.

        Every intermediate stage is also written to its own table in the
        state directory.
        """
        job = self.cluster.job()
        log = job.table(os.path.join(self.logs_dir, self.log_days_range))
        freshness = job.table(self._state_table(self.freshness_filename))

        # Files shipped alongside the log-parsing mapper.
        mapper_files = [
            files.LocalFile('areas.lst'),
            files.LocalFile('liburl_ndp.so'),
            files.LocalFile('url_ndp.py'),
        ]
        owner_urls = log.map(parse_log, files=mapper_files).groupby('owner')

        splitter = SplitUrlsAndCounters(n_urls=self.n_urls_to_sample)
        url_samples, owner_counters = owner_urls.reduce(splitter)
        url_samples.put(self._state_table(self.url_samples_filename))

        # Priority of every owner seen in the logs.
        priorities = owner_counters.join(freshness, by='owner', type='left')
        priorities = priorities.map(CalcPriority(freshness_threshold=self.freshness_threshold))
        priorities = priorities.put(self._state_table(self.owner_priorities_filename))

        # Only owners whose freshness is below the threshold.
        keep_stale = nf.custom(FilterFreshness(threshold=self.freshness_threshold), 'freshness')
        stale = priorities.filter(keep_stale)
        stale = stale.put(self._state_table(self.owner_priorities_filtered_filename))

        # Highest priority first, cut off at the markup quota.
        ranked = stale.groupby().sort('neg_priority')
        selected = ranked.reduce(TakeUntilQuotaExceeds(quota=self.markup_quota))
        selected = selected.put(self._state_table(self.high_priority_data_filename))

        # Attach URL samples and expand them into one record per URL.
        with_urls = selected.join(url_samples, by='owner', type='inner', assume_small_left=True)
        with_urls = with_urls.put(self._state_table(self.high_priority_data_joined_filename))

        result_path = self._state_table(self.data_for_markup_filename)
        with_urls.map(sample_urls_for_markup).put(result_path)

        job.run()

        return result_path

def reduce_owner_freshness(groups):
    """Reducer: sum the freshness of all assessments of each owner.

    A single assessment contributes ``max(0.0, 1.0 - 0.1 * months)``, where
    ``months`` is the number of whole 30-day periods between the mark time
    and the moment the reducer started.  The mark time is taken from
    ``timestamp / 1000`` (presumably milliseconds since the epoch -- confirm
    against the assessments schema).  Marks dated in the future are treated
    as zero days old.

    Yields one record per group with ``owner``, the summed ``freshness`` and
    the number of marks ``n_marks``.
    """
    reference_time = datetime.now()
    for key, records in groups:
        n_marks = 0
        total_freshness = 0
        for record in records:
            marked_at = datetime.fromtimestamp(record.timestamp / 1000)
            age_days = max((reference_time - marked_at).days, 0)
            age_months = int(age_days / 30)

            total_freshness += max(0.0, 1.0 - 0.1 * age_months)
            n_marks += 1

        yield Record(owner=key.owner,
                     freshness=total_freshness,
                     n_marks=n_marks)

def parse_log(records):
    """Mapper: extract (owner, domain, url) for image URLs found in log records.

    Each input record's ``value`` is split on tabs into ``key=value`` pairs.
    Only records with ``service == 'images.yandex'``, a ``ui`` of
    ``images.yandex`` or ``images.yandex/touch`` and an ``stype`` of
    ``image`` or ``image_touch`` are processed.  For every parameter whose
    key starts with ``ans``, the value is split again (on the two-character
    sequence backslash + ``t``) and its ``img_url`` entry, if present, is
    normalised and emitted.

    URLs are skipped when they are not http(s), cannot be encoded/parsed, or
    when their owner matches one of the excluded hosts below.

    NOTE(review): the byte/unicode handling relies on Python 2 string
    semantics; do not reorder the encode/decode steps without testing.
    """
    # NOTE: despite the "whitelist" names, owners matching these patterns are
    # *excluded* from the output (see the three checks before the yield).
    whitelist_starts = ('yandex.',)
    whitelist_match = ('vk.com',
                        'vkontakte.ru',
                        'ok.ru',
                        'odnoklassniki.ru',
                        'fb.com',
                        'facebook.com',
                        'yandex.ru',
                        'yandex.net')
    # Suffix form of the exact matches, i.e. any subdomain of the hosts above.
    whitelist_ends = tuple('.' + host for host in whitelist_match)

    # The second argument evaluates to 'areas.lst'.
    # NOTE(review): presumably '%y' selects the owner part of a URL and
    # 'areas.lst' is the areas file shipped with the job -- confirm against
    # url_ndp.
    owner_formatter = url_ndp.Formatter('%y', ':'.join(['areas.lst']))
    for r in records:
        # Tab-separated "key=value" pairs; fragments without '=' are ignored.
        record_params = dict(kv.split('=', 1) for kv in r.value.split('\t') if '=' in kv)

        if record_params.get('service', '') != 'images.yandex':
            continue

        if record_params.get('ui', '') not in ['images.yandex', 'images.yandex/touch']:
            continue

        if record_params.get('stype', '') not in ['image', 'image_touch']:
            continue

        for key, value in record_params.items():
            if not key.startswith('ans'):
                continue

            # Nested pairs are separated by a literal backslash followed by
            # 't' (an escaped tab), not by a real tab character.
            url_params = dict(kv.split('=', 1) for kv in value.split('\\t') if '=' in kv)

            if 'img_url' not in url_params:
                continue

            try:
                # In Python 2, calling .encode() on a byte string first
                # decodes it as ASCII, so a non-ASCII byte string raises
                # UnicodeDecodeError here and the URL is skipped.
                url_encoded = url_params['img_url'].encode('utf8')
            except UnicodeDecodeError:
                continue

            if not (url_encoded.startswith('https://') or url_encoded.startswith('http://')):
                continue

            try:
                # Re-encode the host part with IDNA and rebuild the URL.
                parsed_url = urlparse.urlparse(url_encoded)
                parsed_url = parsed_url._replace(netloc=parsed_url.netloc.decode('utf8').encode('idna'))
                url_encoded = urlparse.urlunparse(parsed_url)
            except (ValueError, UnicodeDecodeError):
                continue

            # The domain is the third '/'-separated piece of the unquoted
            # URL, i.e. everything between "scheme://" and the next '/'.
            url_unquoted = urlparse.unquote(url_encoded)
            img_url_parts = url_unquoted.split('/')
            if len(img_url_parts) < 3:
                continue

            domain = img_url_parts[2]
            owner = owner_formatter.FormatUrl(str(domain))

            # Drop owners on the exclusion lists defined at the top.
            if owner.startswith(whitelist_starts):
                continue
            if owner in whitelist_match:
                continue
            if owner.endswith(whitelist_ends):
                continue

            yield Record(owner=owner,
                         domain=domain,
                         url=url_encoded.decode('utf8'))


class SplitUrlsAndCounters(object):
    """Two-output reducer: a capped URL list and a record count per owner.

    For every group the first ``n_urls`` URLs (in input order) go to the
    first output as a single ``urls`` list, and the total number of records
    in the group goes to the second output as ``total``.
    """

    def __init__(self, n_urls):
        # Maximum number of URLs kept per owner.
        self.n_urls = n_urls

    def __call__(self, groups, output_urls, output_counters):
        for key, records in groups:
            kept_urls = []
            seen = 0

            # ``seen`` ends up as the size of the group (0 if it is empty).
            for seen, record in enumerate(records, 1):
                if seen <= self.n_urls:
                    kept_urls.append(record.url)

            output_urls(Record(owner=key.owner, urls=kept_urls))
            output_counters(Record(owner=key.owner, total=seen))


class TakeUntilQuotaExceeds(object):
    """Reducer that passes records through while the quota is not exceeded.

    Records are taken in input order and their ``missing_marks`` are
    accumulated across all groups.  The first record that would push the
    sum above ``quota`` is dropped and the counter is moved past the quota,
    so later records with non-negative ``missing_marks`` are dropped too.
    The input is still consumed to the end.
    """

    def __init__(self, quota):
        # Upper bound on the summed ``missing_marks`` of emitted records.
        self.quota = quota

    def __call__(self, groups):
        spent = 0
        for _, records in groups:
            for record in records:
                spent_if_taken = spent + record.missing_marks
                if spent_if_taken <= self.quota:
                    spent = spent_if_taken
                    yield record
                    continue

                # Over the quota: saturate the counter.
                spent = self.quota + 1


def sample_urls_for_markup(records):
    """Mapper: expand each owner record into randomly chosen URL records.

    For every input record, ``min(len(urls), missing_marks)`` URLs are drawn
    without replacement from ``urls``; one output record is emitted per
    drawn URL, carrying over ``owner``, ``missing_marks``, ``priority`` and
    ``total``.
    """
    for record in records:
        candidates = record.urls
        sample_size = int(min(len(candidates), record.missing_marks))
        chosen = random.sample(candidates, sample_size)
        for url in chosen:
            yield Record(owner=record.owner,
                         url=url,
                         missing_marks=record.missing_marks,
                         priority=record.priority,
                         total=record.total)


class FilterFreshness(object):
    """Predicate that is true for freshness values strictly below a threshold."""

    def __init__(self, threshold):
        # Exclusive upper bound for accepted freshness values.
        self.threshold = threshold

    def __call__(self, freshness):
        limit = self.threshold
        return freshness < limit


class CalcPriority(object):
    """Mapper that scores owners by log volume and lack of fresh marks.

    For each record:

    * ``freshness`` is the record's ``freshness`` field (0.0 when absent);
    * incompleteness is ``max(0.0, freshness_threshold - freshness)``;
    * ``priority`` is ``log(total) * incompleteness`` and ``neg_priority``
      is its negation;
    * ``missing_marks`` is ``int(incompleteness + 0.99)``.
    """

    def __init__(self, freshness_threshold):
        # Freshness level at which an owner needs no more marks.
        self.freshness_threshold = freshness_threshold

    def __call__(self, records):
        for record in records:
            known_freshness = float(record.get('freshness', 0.0))
            gap = max(0.0, self.freshness_threshold - known_freshness)
            score = math.log(record.total) * gap
            marks_needed = int(gap + 0.99)

            yield Record(owner=record.owner,
                         priority=score,
                         neg_priority=-score,
                         freshness=known_freshness,
                         total=record.total,
                         missing_marks=marks_needed)


def main(*args):
    """Entry point: run the whole pre-markup pipeline.

    Expects exactly six positional arguments
    ``(params, in2, in3, token, any_param, html_file)``; only ``params``
    (whose first element is a dict of settings) and ``token`` are used.
    Progress messages are written to stderr.

    Returns a one-element list describing the table with data for markup.
    """
    params, _in2, _in3, token, _any_param, _html_file = args

    settings = params[0]
    init_kwargs = dict(
        root_dir=settings['root_dir'],
        logs_dir=settings['logs_dir'],
        prev_state=settings['prev_state'],
        new_state=settings['new_state'],
        log_days_range=settings['days_range'],
        markup_quota=settings['markup_quota'],
        token=token,
    )

    print >>sys.stderr, "before init"
    pre_markup = PreMarkupProcess(**init_kwargs)
    print >>sys.stderr, "init success"

    pre_markup.create_state_from_assessments()
    print >>sys.stderr, "create state success"

    pre_markup.calc_freshness()
    print >>sys.stderr, "freshness success"

    markup_table_path = pre_markup.sample_data_for_markup()
    print >>sys.stderr, "sample for markup success"

    return [{'cluster': 'hahn', 'table': markup_table_path}]
