#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import sys
import os
import re
import arrow
import requests
from tqdm import tqdm
from cStringIO import StringIO
import time
import codecs
import contextlib
import itertools
import logging
import toml
import pdb
import argparse
import tempfile
import traceback
import subprocess
import smtplib
import mapreducelib
import threading
import urllib
import json
import yaml
import uuid
from time import sleep
try:
    import thread
except ImportError:
    import _thread as thread
import urlparse
from collections import defaultdict, Counter, namedtuple, deque
import datetime as dt
from datetime import datetime as dtdt

# PATH = ['12.1620.705', '12.1620.486']

# Maps dotted path identifiers to the action name they stand for.
# NOTE(review): not referenced by any code in the visible part of this file.
PATH = {'12.1620.705': 'install',
        '12.1620.486': 'cancel'}
# Lightweight record types. ``Ident`` is the aggregation key built by
# ident_from_line() and read field-by-field by format_ident().
# NOTE(review): ``Parameter`` is not used in the visible part of this file.
Parameter = namedtuple('Parameter', ['element', 'eventtype',
                                     'lang'])
Ident = namedtuple('Ident', ['product', 'service', 'lang',
                             'browser', 'description', 'element', 'candidate'])
# NOTE(review): never read or written in the visible part of this file.
browser_dict = {}

# Settings consumed by send_email(): SMTP host, recipient logins (the mail
# domain is appended inside send_email), subject line and sender address.
SERVER = "localhost"
RECIPIENTS = ['riddle', 'pecheny']
SUBJECT = "AtomBanners stat uploader"
FROM = "stat-uploader@monitoring1.haze.yandex.net"

# Per-field dictionaries keyed by field name ('product', 'service', ...).
# process_files() fills each entry with a reversed (value -> id) mapping and
# process_ident() extends them; a missing field name yields a fresh empty dict.
statdict = defaultdict(lambda: {})


def send_email(recipients, message):
    """Send a plain-text notification through the SMTP host in ``SERVER``.

    :param recipients: iterable of logins; ``@yandex-team.ru`` is appended to
        each one to form the actual address.
    :param message: text placed in the body of the mail.

    The SMTP session is always closed, even when sending fails; the original
    exception from ``sendmail`` is propagated to the caller.
    """
    recipients = ['{}@yandex-team.ru'.format(rec) for rec in recipients]
    body = """\
From: {}
To: {}
Subject: {}

{}
""".format(FROM, ", ".join(recipients), SUBJECT, message).encode('utf8')
    server = smtplib.SMTP(SERVER)
    try:
        server.sendmail(FROM, recipients, body)
    finally:
        # Do not leak the connection when sendmail() raises.
        server.quit()


@contextlib.contextmanager
def make_temp_file(**kwargs):
    """Create a temporary file and delete it when the ``with`` block ends.

    Keyword arguments are forwarded to ``tempfile.mkstemp``. The value bound
    by ``with ... as`` is the ``(fd, path)`` tuple returned by ``mkstemp``.
    The caller owns the descriptor and is responsible for closing it; only
    the path is cleaned up here.

    The file is removed even when the body of the ``with`` block raises.
    """
    temp_file = tempfile.mkstemp(**kwargs)
    try:
        yield temp_file
    finally:
        os.remove(temp_file[1])


def deutf8ify(rec):
    """Decode UTF-8 byte strings into unicode (Python 2 semantics).

    :param rec: a ``mapreducelib.SubkeyedRecord``, a byte string, or anything
        else.
    :returns: for a subkeyed record, a new record whose key, subkey and value
        are unicode; for a byte string, its decoded form; any other object is
        returned unchanged. Undecodable bytes are replaced, never raised on.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        decoded = []
        for part in (rec.key, rec.subkey, rec.value):
            if not isinstance(part, unicode):
                part = part.decode('utf8', errors='replace')
            decoded.append(part)
        # The original returned the bare name ``Record``, which is not defined
        # anywhere in this module (NameError). Rebuild the same record class
        # that was just matched instead.
        # NOTE(review): assumes SubkeyedRecord(key, subkey, value) is the
        # constructor signature -- confirm against mapreducelib.
        return mapreducelib.SubkeyedRecord(*decoded)
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def utf8ify(rec):
    """Encode unicode text to UTF-8 byte strings (Python 2 semantics).

    A ``mapreducelib.SubkeyedRecord`` is updated in place (key, subkey and
    value are encoded when they are unicode) and the same object is returned.
    A bare unicode string is returned encoded; anything else is returned
    untouched.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        for attr in ('key', 'subkey', 'value'):
            if isinstance(getattr(rec, attr), unicode):
                setattr(rec, attr, getattr(rec, attr).encode('utf8'))
        return rec
    if isinstance(rec, unicode):
        return rec.encode('utf8')
    return rec


def tryint(string):
    """Convert *string* to ``int``, returning ``-1`` when it cannot be converted.

    Only conversion failures are absorbed (wrong type, malformed text,
    infinite floats); KeyboardInterrupt and SystemExit are no longer swallowed
    as they were by the previous bare ``except``.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        return -1


def dttots(dt_):
    """Return whole seconds elapsed between 1970-01-01 00:00 and naive *dt_*."""
    epoch = dtdt(1970, 1, 1)
    elapsed = dt_ - epoch
    return int(elapsed.total_seconds())


def parsevars(vars):
    """Parse a comma-separated ``key=value`` list into a lenient dict.

    * ``key=value`` items are stored under ``key``; one leading ``-`` of the
      key is dropped, and the value keeps any further ``=`` characters.
    * keys starting with ``clid`` are collected, in order, under ``'clids'``.
    * items without ``=`` are stored with the marker value ``'SINGLE'``.

    The result is a ``defaultdict`` that yields ``''`` for unknown keys.
    """
    parsed = {'clids': []}
    for item in vars.split(','):
        name, separator, payload = item.partition('=')
        if not separator:
            parsed[item] = 'SINGLE'
            continue
        if name.startswith('-'):
            name = name[1:]
        if name.startswith('clid'):
            parsed['clids'].append(payload)
        else:
            parsed[name] = payload
    return defaultdict(lambda: '', parsed)


def parseparams(value):
    """Parse a tab-separated ``key=value`` list into a lenient dict.

    The value part keeps any additional ``=`` characters. Items without ``=``
    are stored with the marker value ``'SINGLE'``. The result is a
    ``defaultdict`` that yields ``''`` for unknown keys.
    """
    parsed = {}
    for item in value.split('\t'):
        name, separator, payload = item.partition('=')
        parsed[name if separator else item] = payload if separator else 'SINGLE'
    return defaultdict(lambda: '', parsed)


def tabulate(*args):
    """Join the ``format()`` rendering of every argument with tab characters."""
    cells = [format(cell) for cell in args]
    return '\t'.join(cells)


def ntabulate(*args):
    """Return the tab-joined row produced by ``tabulate`` plus a newline."""
    row = tabulate(*args)
    return row + '\n'


def gettld(url):
    """Return the last dot-separated label of the URL's host, without a port.

    A URL lacking ``//`` gets ``http://`` prepended so that the host lands in
    the parsed network location.
    """
    target = url if '//' in url else 'http://' + url
    host = urlparse.urlparse(target).netloc
    last_label = host.rsplit('.', 1)[-1]
    return last_label.split(':', 1)[0]


def format_ident(fd, ident, ident_dict, ident2=None):
    """Render one newline-terminated, tab-separated report row.

    :param fd: value placed in the first column.
    :param ident: object whose ``product``, ``service``, ``lang``,
        ``browser``, ``description``, ``element`` and ``candidate``
        attributes fill the next seven columns.
    :param ident_dict: mapping from an ident to its counters
        (``shows``, ``installs``, ``cancels``, ``clicks``).
    :param ident2: key used to look the counters up; falls back to *ident*
        when omitted or falsy.
    """
    lookup_key = ident2 if ident2 else ident
    columns = [fd]
    for attribute in ('product', 'service', 'lang', 'browser',
                      'description', 'element', 'candidate'):
        columns.append(getattr(ident, attribute))
    for counter in ('shows', 'installs', 'cancels', 'clicks'):
        columns.append(ident_dict[lookup_key][counter])
    return ntabulate(*columns)


def ident_from_line(tabs):
    """Build an ``Ident`` from columns 1..7 of an already split line.

    Raises ``IndexError`` when *tabs* has fewer than eight items.
    """
    columns = ('product', 'service', 'lang', 'browser',
               'description', 'element', 'candidate')
    fields = {}
    for position, column in enumerate(columns, 1):
        fields[column] = tabs[position]
    return Ident(**fields)


def debug_tqdm(x, debug):
    """Wrap *x* in a ``tqdm`` progress bar when *debug* is truthy, else return *x* as is."""
    return tqdm(x) if debug else x

# HTTP headers sent with every request to the stat service (see
# get_dictionary_from_stat, post_dictionary_to_stat and process_files).
# NOTE(review): the robot password is hard-coded in source control; consider
# loading it from the environment or a secrets store and rotating this value.
HEADERS = {'StatRobotUser': 'robot_pecheny',
           'StatRobotPassword': 'OoGh1Adahy'}
# Endpoint that process_files() posts report data to.
URL = 'https://upload.stat.yandex-team.ru/_api/report/data'
# Endpoint used to read and write the named dictionaries.
DICTAPI = 'https://stat.yandex-team.ru/_api/dictionary'


def get_lastdate():
    """Return the date stored in ``daily_stat_dict_last`` as a ``datetime.date``.

    The state file is looked up relative to the current working directory and
    is expected to hold the ``%Y-%m-%d`` text written by ``set_lastdate``.
    When the file does not exist, the Unix epoch (as a Europe/Moscow calendar
    date) is returned so that every available date counts as new.

    Both branches now return a plain ``date``: previously the "file exists"
    branch returned an Arrow object, which callers then compared against
    ``date`` values.
    """
    if os.path.isfile('daily_stat_dict_last'):
        with open('daily_stat_dict_last') as f:
            # Tolerate a trailing newline left by manual edits of the file.
            ts = f.read().strip()
        return arrow.get(ts).date()
    return arrow.get(0).to('Europe/Moscow').date()


def set_lastdate(ts):
    """Overwrite ``daily_stat_dict_last`` (in the cwd) with *ts* as ``YYYY-MM-DD``."""
    with open('daily_stat_dict_last', 'w') as state_file:
        stamp = ts.strftime('%Y-%m-%d')
        state_file.write(stamp)


def tsfile(tfile):
    """Parse the timestamp embedded in a ``...tsv_data_<stamp>.tsv.log`` name.

    The text between ``tsv_data_`` and ``.tsv.log`` is handed to ``arrow.get``
    with a ``+03:00`` offset appended. Raises ``IndexError`` when the name has
    no ``tsv_data_`` marker.
    """
    stem = tfile.split('.tsv.log')[0]
    stamp = stem.split('tsv_data_')[1]
    return arrow.get(stamp + '+03:00')


def get_srcfiles(lb=None, ub=None, allfiles=None):
    """Return the sorted log files whose timestamp lies in ``(lb, ub]``.

    :param lb: exclusive lower bound; defaults to ``get_lastdate()``.
    :param ub: inclusive upper bound; when omitted there is no upper limit
        (the previous hard-coded 2025-01-01 horizon silently hid newer files).
    :param allfiles: candidate paths; defaults to ``get_allfiles()``.

    File timestamps come from ``tsfile`` and are Arrow objects, so both
    bounds are normalised through ``arrow.get`` before comparing. The old
    code compared Arrow values directly against ``date`` objects.
    NOTE(review): ``arrow.get`` turns a plain ``date`` bound into midnight
    UTC of that day -- confirm this matches the intended cut-off.
    """
    if not allfiles:
        allfiles = get_allfiles()
    lower = arrow.get(lb if lb else get_lastdate())
    upper = arrow.get(ub) if ub else None
    selected = []
    for path in allfiles:
        stamp = tsfile(path)  # parse each file name only once
        if stamp > lower and (upper is None or stamp <= upper):
            selected.append(path)
    return sorted(selected)


def get_allfiles():
    """Return absolute paths of ``tsv_data*`` entries inside ``./logs``.

    The directory is resolved against the current working directory, which is
    left untouched. The previous ``chdir('logs')`` / ``chdir('..')`` pair
    moved the whole process elsewhere when ``logs`` was a symlink, and left
    it inside ``logs`` when listing failed.

    Raises ``OSError`` when the ``logs`` directory cannot be listed.
    """
    logs_dir = os.path.abspath('logs')
    return [os.path.join(logs_dir, name)
            for name in os.listdir(logs_dir)
            if name.startswith('tsv_data')]


def get_full_dates(srcfiles, logger=None):
    """Group log files by calendar date and keep only complete days.

    :param srcfiles: iterable of log file paths understood by ``tsfile``.
    :param logger: optional logger for unparsable names; defaults to this
        module's logger. The previous code referenced a global ``logger``
        that does not exist at module level, so the first bad file name
        raised NameError instead of being skipped.
    :returns: ``{date: [paths in sorted order]}`` restricted to dates that
        have exactly 48 distinct timestamps.
    """
    if logger is None:
        logger = logging.getLogger(__name__)
    stamps_by_date = defaultdict(set)
    files_by_date = defaultdict(list)
    for path in sorted(srcfiles):
        try:
            stamp = tsfile(path)
            day = stamp.date()
        except Exception:
            # A malformed file name must not abort the scan; record it and
            # move on to the next file.
            logger.error(traceback.format_exc())
            continue
        stamps_by_date[day].add(stamp)
        files_by_date[day].append(path)
    return {day: files_by_date[day]
            for day, stamps in stamps_by_date.items()
            if len(stamps) == 48}


def crop_dates(datesdict, lb=None, ub=None):
    """Drop, in place, every key of *datesdict* outside ``[lb, ub]``.

    :param datesdict: mapping keyed by ``datetime.date``; it is modified and
        also returned.
    :param lb: inclusive lower bound; defaults to 2016-02-11.
    :param ub: inclusive upper bound; defaults to no upper limit. The former
        hard-coded 2025-01-01 default discarded every later date.
    """
    if not lb:
        lb = dt.date(2016, 2, 11)
    if not ub:
        ub = dt.date.max
    # Iterate over a snapshot of the keys: removing entries while iterating
    # the dict itself raises RuntimeError.
    for day in list(datesdict):
        if day < lb or day > ub:
            datesdict.pop(day)
    return datesdict


def _as_plain_date(value):
    """Return *value* as a ``datetime.date`` (datetime-like objects via ``.date()``)."""
    if isinstance(value, dt.date) and not isinstance(value, dt.datetime):
        return value
    return value.date()


def _load_toml(path):
    """Parse the TOML file at *path*, closing the file afterwards."""
    with open(path, 'r') as f:
        return toml.loads(f.read())


def main():
    """Command-line entry point of the uploader.

    Without both ``--datefrom`` and ``--dateto`` the script works
    incrementally: it repeatedly picks the earliest complete date that is
    strictly newer than the stored last date, uploads it in append mode,
    records it with ``set_lastdate`` and rescans the logs, until nothing new
    is left; it then exits with status 0.

    With both options (``YYYYMMDD``) every complete date in that inclusive
    range is re-uploaded with ``append=False`` and the stored last date is
    not touched.
    """
    # Pin __file__ to an absolute path (Python 2 may leave it relative)
    # before the working directory changes below.
    global __file__
    __file__ = os.path.abspath(__file__)
    _file_ = os.path.basename(__file__)

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--config', default=None)
    parser.add_argument('--datefrom', default=None)
    parser.add_argument('--dateto', default=None)
    args = parser.parse_args()
    start = int((dtdt.now() - dtdt(1970, 1, 1)).total_seconds())

    # Console output is verbose only with --debug; the per-run log file
    # always receives everything.
    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    ch.setLevel(logging.DEBUG if args.debug else logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # load config
    # NOTE(review): basic.toml is read from the invoking directory, i.e.
    # before the chdir to the script directory -- confirm this is intended.
    config = _load_toml('basic.toml')
    os.chdir(os.path.dirname(__file__))
    config.update(_load_toml('distribution.toml'))
    if args.config is None:
        config.update(_load_toml('fastlogs_stat_uploader.toml'))
    else:
        config.update(_load_toml(args.config))

    if not (args.datefrom and args.dateto):
        while True:
            # Re-read the state and rescan the logs on every pass so that
            # files arriving during a long run are picked up as well.
            lb = _as_plain_date(get_lastdate())
            srcdates = crop_dates(get_full_dates(get_allfiles()), lb)
            # Only dates strictly after the last uploaded one are new. The
            # old loop kept re-selecting the same date and never terminated.
            pending = sorted(day for day in srcdates if day > lb)
            if not pending:
                break
            current = pending[0]
            if not process_files(srcdates[current], logger, config,
                                 debug=args.debug):
                logger.error('Processing of {} failed, stopping'
                             .format(current))
                break
            set_lastdate(current)
        # The original message referenced an undefined name (``lastts``).
        logger.info("No new data. Latest counted ts is {}"
                    .format(lb))
        sys.exit(0)
    else:
        lb = arrow.get(args.datefrom, 'YYYYMMDD').date()
        ub = arrow.get(args.dateto, 'YYYYMMDD').date()
        # get_allfiles must be called here; the function object itself was
        # being passed before.
        srcdates = crop_dates(get_full_dates(get_allfiles()), lb, ub)
        for srcdate in sorted(srcdates):
            process_files(srcdates[srcdate], logger, config, append=False,
                          debug=args.debug)


def zmax(seq):
    """Return ``max(seq)``, or ``-1`` when the sized collection *seq* is empty."""
    if len(seq) == 0:
        return -1
    return max(seq)


def get_set_dictionary(dictionary, value):
    """Return the id stored for *value*, registering a new one when absent.

    A new id is one greater than the largest id already present
    (``0`` for an empty dictionary). *dictionary* is updated in place.
    """
    if value not in dictionary:
        known_ids = set(dictionary.values())
        dictionary[value] = zmax(known_ids) + 1
    return dictionary[value]


def get_dictionary_from_stat(name):
    """Fetch the dictionary called *name* from the ``DICTAPI`` endpoint.

    :returns: on HTTP 200, the decoded JSON object with its keys converted to
        ``int``; on any other status, the raw ``requests`` response object,
        so callers must check the type of the result.
    """
    endpoint = DICTAPI + '?name={}'.format(name)
    req = requests.get(endpoint, headers=HEADERS)
    if req.status_code != 200:
        return req
    payload = json.loads(req.content)
    return dict((int(key), item) for key, item in payload.items())


def post_dictionary_to_stat(dictionary, name):
    """POST *dictionary* (JSON-encoded) under *name* to ``DICTAPI``.

    Returns the ``requests`` response without inspecting its status.
    """
    form = {
        'name': name,
        'language': '',
        'dictionary': json.dumps(dictionary),
        'editors': ['pecheny', 'riddle'],
    }
    return requests.post(DICTAPI, headers=HEADERS, data=form)


def process_ident(ident):
    """Translate the seven textual fields of *ident* into dictionary ids.

    :param ident: sequence of exactly seven values in the order product,
        service, lang, browser, description, element, candidate.
    :returns: list of seven ids in the same order. Values not yet known get
        a new id registered in the matching ``statdict`` entry.

    The former bare ``except`` around the first lookup dropped into
    ``pdb.set_trace()`` and then carried on with the raw value; failures now
    propagate to the caller.
    """
    product, service, lang, browser, description, element, candidate = ident
    fields = (
        ('product', product),
        ('service', service),
        ('lang', lang),
        ('browser', browser),
        ('description', description),
        ('element', element),
        ('candidate', candidate),
    )
    return [get_set_dictionary(statdict[name], value)
            for name, value in fields]


def yaml_dump(dictionary):
    """Serialise *dictionary* as block-style YAML with a leading ``---`` marker."""
    rendered = yaml.safe_dump(
        dictionary,
        explicit_start=True,
        default_flow_style=False,
    )
    return rendered


def reverse_dict(dictionary):
    """Return a new dict mapping each value of *dictionary* back to its key."""
    return dict((item, key) for key, item in dictionary.items())


def yaml_dump_all():
    """Write every ``statdict`` entry to ``<name>.yaml`` in the cwd.

    Each file holds the reversed mapping of the corresponding entry.
    """
    for field_name in statdict:
        target = '{}.yaml'.format(field_name)
        with codecs.open(target, 'w', 'utf8') as out:
            inverted = reverse_dict(statdict[field_name])
            out.write(yaml_dump(inverted))


# Column order of the uploaded TSV; the first row of every chunk.
_REPORT_COLUMNS = ('fielddate', 'product', 'service',
                   'lang', 'browser', 'description', 'element', 'candidate',
                   'shows', 'installs', 'cancels', 'clicks')
# Fields whose values are replaced by ids from the statface dictionaries.
_DICTIONARY_FIELDS = ('product', 'service', 'lang', 'browser',
                      'description', 'element', 'candidate')
# A new upload chunk is started every this many input lines.
_LINES_PER_CHUNK = 3000000


def _merge_and_sort(srcfiles, tempfn, logger):
    """Concatenate *srcfiles* into *tempfn* and sort it bytewise in place."""
    for srcfile in srcfiles:
        logger.info('Merging "{}" into "{}"...'.format(srcfile, tempfn))
        with open(tempfn, 'ab') as merged:
            subprocess.check_call(['cat', srcfile], stdout=merged)
    logger.info('Sorting "{}"...'.format(tempfn))
    # str() keeps the value a native string under unicode_literals.
    subprocess.check_call(['sort', '-o', tempfn, tempfn],
                          env=dict(os.environ, LC_ALL=str('C')))


def _new_chunk():
    """Return an in-memory buffer that already contains the header row."""
    chunk = StringIO()
    print(tabulate(*_REPORT_COLUMNS), file=chunk)
    return chunk


def _write_row(chunk, fd, ident, counters):
    """Append the aggregated row for *ident* to *chunk* (*fd* ends with a tab)."""
    chunk.write(fd)
    print(tabulate(*(process_ident(ident) + counters)), file=chunk)


def _aggregate(tempfn, fd, logger, debug):
    """Sum the counters of consecutive lines sharing an ident.

    Returns the list of TSV chunks, or an empty list when the file holds no
    usable line.
    """
    ready = []
    chunk = _new_chunk()
    prev_ident = None
    counters = [0, 0, 0, 0]  # shows, installs, cancels, clicks
    with codecs.open(tempfn, 'r', 'utf8') as merged:
        for lineno, line in enumerate(debug_tqdm(merged, debug), 1):
            if lineno % _LINES_PER_CHUNK == 0:
                ready.append(chunk.getvalue())
                chunk.close()
                chunk = _new_chunk()
            tabs = line.rstrip().replace('\u0000', '').split('\t')
            if len(tabs) > 12:
                logger.info('Error on line {}'.format(line.rstrip()))
                continue
            # Parse all four counters before touching any state, so that a
            # broken line can never leave the running totals half-updated.
            try:
                values = [int(tabs[pos]) for pos in range(8, 12)]
            except (IndexError, ValueError):
                logger.error('Broken line: {}'.format(line.rstrip()))
                continue
            ident = tabs[1:8]
            if prev_ident is not None and ident != prev_ident:
                _write_row(chunk, fd, prev_ident, counters)
                counters = values
            else:
                counters = [total + value
                            for total, value in zip(counters, values)]
            prev_ident = ident
    if prev_ident is None:
        chunk.close()
        return []
    _write_row(chunk, fd, prev_ident, counters)
    ready.append(chunk.getvalue())
    chunk.close()
    return ready


def _upload_chunk(tsv_chunk, ts, append, logger):
    """POST one chunk, retrying up to five times a minute apart.

    Sends an e-mail when the service reports invalid data or when every
    attempt failed.
    """
    data_counters = {
        "name": "Distribution/Others/AtomBanners/v3_daily",
        "scale": "d",
        "_append_mode": (1 if append else 0),
        "tsv_data": tsv_chunk,
    }
    req = None
    for _ in range(5):
        req = requests.post(URL, headers=HEADERS, data=data_counters)
        logger.info(req.text)
        # Retrying cannot help when the service rejected the data itself.
        if req.status_code == 200 or b'Error in data' in req.content:
            break
        time.sleep(60)
    if b'Error in data' in req.content:
        send_email(RECIPIENTS, '{} seems to have invalid data'.format(ts))
    elif req.status_code != 200:
        send_email(RECIPIENTS, 'Upload of data from {} failed'.format(ts))


def process_files(srcfiles, logger, config, append=True, debug=False):
    """Aggregate one day's worth of log files and upload the result.

    :param srcfiles: non-empty list of log paths; the report date is taken
        from the first one via ``tsfile``.
    :param logger: logger used for progress and error messages.
    :param config: accepted for interface compatibility; not used here.
    :param append: upload in append mode when true, otherwise with
        ``_append_mode`` set to 0.
    :param debug: show a progress bar while reading the merged file.
    :returns: always ``True``. Upload problems are reported by e-mail rather
        than through the return value, because chunks already uploaded in
        append mode must not be sent a second time.
    :raises subprocess.CalledProcessError: when merging or sorting fails.
    :raises RuntimeError: when a statface dictionary cannot be fetched.

    Changes from the previous version: the leftover ``pdb.set_trace()`` is
    gone, the temporary file is always removed, failures of the external
    ``cat``/``sort`` commands are no longer ignored, a broken line is skipped
    as a whole, and a day without any usable line is logged and skipped
    instead of crashing.
    """
    logger.info('Source file is {}'.format(srcfiles))
    ts = tsfile(srcfiles[0]).date()
    fd = ts.strftime('%Y-%m-%d') + '\t'
    tempfn = unicode(uuid.uuid4())
    try:
        _merge_and_sort(srcfiles, tempfn, logger)
        logger.info('Getting dicts from statface...')
        for x in _DICTIONARY_FIELDS:
            fetched = get_dictionary_from_stat(
                'vcfs::atombanners_{}'.format(x))
            if not isinstance(fetched, dict):
                # get_dictionary_from_stat returns the raw response on
                # failure; fail with a readable message instead of an
                # AttributeError further down.
                raise RuntimeError(
                    'Could not fetch dictionary "{}" (HTTP status {})'.format(
                        x, getattr(fetched, 'status_code', None)))
            statdict[x] = reverse_dict(fetched)
        logger.info('Gathering data from {}...'.format(tempfn))
        ready = _aggregate(tempfn, fd, logger, debug)
    finally:
        if os.path.exists(tempfn):
            os.remove(tempfn)
    yaml_dump_all()
    if not ready:
        logger.warning('No data rows found for {}, nothing to upload'
                       .format(ts))
        return True
    logger.info('Ready to post data')
    for tsv_chunk in ready:
        _upload_chunk(tsv_chunk, ts, append, logger)
    return True


# Run the uploader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
