# -*- coding: utf-8 -*-
import string
import tarfile
import json
from datetime import datetime
from collections import defaultdict

import utils

# Characters that count as Cyrillic for the "has_cyrillic_characters" factor.
CYRILLIC_LETTERS = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
# Numeric id emitted as the "ocr_language" factor, keyed by the language code
# found in the OCR data (two- and three-letter spellings map to the same id).
LANGUAGE_IDS = {
    'en': 0,
    'eng': 0,
    'ru': 1,
    'rus': 1
}
# Largest number of characters dropped from either end of a word when word
# list variants (and the matching factor names) are generated; 0 means only
# whole words are used.
WORDLIST_SKIP_CHARACTERS_MAX = 0
# Segments over which the segmented factors are computed. None stands for the
# unsegmented set of all answers; the other names are the labels produced by
# checkup_segments().
CHECKUP_SEGMENTS = [None, "checkup_valid", "checkup_valid_with_capitals", "checkup_valid_with_punctuation", "checkup_valid_with_digits"]
# Factors computed once per unique answer (not per segment); factor_names()
# lists them first, in this order.
FACTOR_NAMES = [
    "has_capital_characters",
    "has_cyrillic_characters",
    "has_digits",
    "has_latin_characters",
    "has_punctuation",
    "ocr_answer_levenshtein_distance",
    "ocr_answer_length_difference",
    "ocr_confidence",
    "ocr_language",
    "position_difference_abs_ratio",
    "position_difference_ratio",
    "position_left_ratio",
    "position_right_ratio",
    "response_time_avg",
    "response_time_max",
    "response_time_min",
    "response_time_min_normalized_to_avg",
    "response_time_min_normalized_to_max",
    "response_time_avg_normalized_to_current_answer_length",
    "response_time_perc50_normalized_to_current_answer_length",
    "response_time_perc25",
    "response_time_perc50",
    "response_time_perc75",
    "time_of_day_avg",
    "time_of_day_perc25",
    "time_of_day_perc50",
    "time_of_day_perc75",
]
# Base names of the factors computed once per entry of CHECKUP_SEGMENTS; the
# final name is built by segmented_factor_name().
SEGMENTED_FACTOR_NAMES = [
    "current_answer_count",
    "current_answer_ratio",
    "current_answer_lowercased_ratio",
    "current_answer_to_most_popular_answer_ratio",
    "most_popular_answer_levenshtein_distance",
    "unique_answers_count",
    "unique_answers_count_normalized",
    "total_answers",
]

def segmented_factor_name(name, segment):
    """Return the factor name qualified by a checkup segment.

    A ``None`` segment leaves ``name`` unchanged; any other segment is
    appended as an ``_among_<segment>`` suffix.
    """
    if segment is not None:
        return "%s_among_%s"%(name, segment)
    return name

def wordlist_factor_names(word_lists=[]):
    """Return the word-list membership factor names for ``word_lists``.

    One name is produced per word list and per (skip-left, skip-right) pair,
    each skip count ranging from 0 to ``WORDLIST_SKIP_CHARACTERS_MAX``
    inclusive. Every word list must provide ``["metadata"]["id"]``.
    """
    return [
        "in_%s_wordlist_skipleft_%d_skipright_%d"%(entry["metadata"]["id"], skip_left, skip_right)
        for entry in word_lists
        for skip_left in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1)
        for skip_right in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1)
    ]

def factor_names(word_lists=[]):
    """Return the full, ordered list of factor names.

    The order is: the plain names from ``FACTOR_NAMES``, every segmented
    factor for every entry of ``CHECKUP_SEGMENTS``, the per-segment
    ``current_answer_<segment>_ratio`` names (the ``None`` segment has none),
    and finally the word-list factor names for ``word_lists``.
    """
    names = list(FACTOR_NAMES)
    names += [
        segmented_factor_name(base_name, seg)
        for base_name in SEGMENTED_FACTOR_NAMES
        for seg in CHECKUP_SEGMENTS
    ]
    names += [
        "current_answer_%s_ratio"%seg
        for seg in CHECKUP_SEGMENTS
        if seg is not None
    ]
    names += wordlist_factor_names(word_lists)
    return names

def load_word_list(tarball):
    """Load a word list from a tar archive.

    The archive is read as a stream from the file object ``tarball`` and may
    contain exactly two members: ``words`` (UTF-8 text, one word per line)
    and ``metadata.json``.

    Returns a dict with two keys:
        "words": maps every word variant to the set of ``(skip_left,
            skip_right)`` pairs that produce it, where a variant is a word
            with ``skip_left`` leading and ``skip_right`` trailing characters
            removed (each count from 0 to ``WORDLIST_SKIP_CHARACTERS_MAX``).
        "metadata": the parsed content of ``metadata.json``.

    Raises RuntimeError when the archive holds an unexpected member or lacks
    either of the two expected ones.
    """
    def add_word(words, word):
        # Register every non-empty trimmed variant of the word together with
        # the skip counts that yield it.
        for i in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1):
            for j in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1):
                part = word[i:]
                if j:
                    # Trim the right side of the already left-trimmed string;
                    # slicing the full word here would silently discard the
                    # left skip for variants with both i > 0 and j > 0.
                    part = part[:-j]
                if part:
                    words.setdefault(part, set()).add((i, j))

    result = {}
    # "r|*" reads the archive as a forward-only stream with transparent
    # compression detection, so members are consumed in archive order.
    with tarfile.open(fileobj=tarball, mode="r|*") as tar:
        for tarinfo in tar:
            content = tar.extractfile(tarinfo)
            if tarinfo.name == "words":
                words = {}
                for line in content:
                    add_word(words, line.decode("utf-8").strip())
                result["words"] = words
            elif tarinfo.name == "metadata.json":
                result["metadata"] = json.load(content)
            else:
                raise RuntimeError("Unexpected file %s in words archive"%repr(tarinfo.name))
    if "words" not in result:
        raise RuntimeError('No "words" file in words archive')
    if "metadata" not in result:
        raise RuntimeError('No "metadata" file in words archive')
    return result

def seconds_since_midnight(timestamp):
    """Return the seconds elapsed since 00:00:00 on the timestamp's own day.

    ``timestamp`` must be formatted as ``YYYY-MM-DD HH:MM:SS``; anything else
    makes ``strptime`` raise ``ValueError``. The result is a float.
    """
    moment = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    day_start = datetime(moment.year, moment.month, moment.day)
    elapsed = moment - day_start
    return elapsed.total_seconds()

def calculate_stats(iterable, prefix=''):
    """Compute summary statistics over the numeric values in ``iterable``.

    Every value is converted with ``float()``. The result is a dict with the
    keys ``avg``, ``min``, ``perc25``, ``perc50``, ``perc75`` and ``max``;
    when ``prefix`` is non-empty each key is written as ``<prefix>_<key>``.
    The percentiles are not interpolated: they are the elements found at
    indices ``n // 4``, ``n // 2`` and ``3 * n // 4`` of the sorted values.

    An empty ``iterable`` raises ``ZeroDivisionError``.
    """
    values = [float(value) for value in iterable]
    # Sum in the original input order, before sorting, so the floating point
    # result does not depend on the sort.
    value_sum = sum(values)
    values.sort()
    count = len(values)
    if prefix:
        prefix += '_'
    # Floor division keeps the indices integral even under true division
    # (Python 3 or "from __future__ import division"), where "/" would
    # produce a float that cannot be used as a list index.
    return {
        prefix+'avg': value_sum/count,
        prefix+'min': values[0],
        prefix+'perc25': values[count//4],
        prefix+'perc50': values[count//2],
        prefix+'perc75': values[count*3//4],
        prefix+'max': values[-1]
    }

def has_punctuation(s):
    """Return True if any character of ``s`` is in ``string.punctuation``."""
    for ch in s:
        if ch in string.punctuation:
            return True
    return False

def has_capitals(s):
    """Return True if any character of ``s`` is uppercase."""
    for ch in s:
        if ch.isupper():
            return True
    return False

def has_digits(s):
    """Return True if any character of ``s`` is a digit."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

def checkup_segments(answer):
    """Return the checkup segment labels that apply to one answer record.

    A record whose ``"levenshtein_distance"`` is positive belongs to no
    segment. Otherwise it is always in ``"checkup_valid"`` and additionally
    in the punctuation / capitals / digits segments (in that order) when its
    ``"checkup"`` text contains such characters.
    """
    if answer["levenshtein_distance"] > 0:
        return []

    text = answer["checkup"]
    segments = ["checkup_valid"]
    optional_segments = (
        (has_punctuation, "checkup_valid_with_punctuation"),
        (has_capitals, "checkup_valid_with_capitals"),
        (has_digits, "checkup_valid_with_digits"),
    )
    segments.extend(label for predicate, label in optional_segments if predicate(text))
    return segments

def calculate_segmented_factors(unique_answer, unique_answers, ocr_data, word_lists):
    """Compute the per-segment popularity factors of one answer.

    Parameters:
        unique_answer: the answer text being scored.
        unique_answers: mapping from answer text to the list of answer
            records in the segment under consideration. It is never
            modified, and ``unique_answer`` may be absent from it (it then
            counts as having zero records).
        ocr_data, word_lists: accepted for interface compatibility; not used.

    Returns a dict keyed by the base names in ``SEGMENTED_FACTOR_NAMES``.
    All ratios are ``.0`` when their denominator would be zero, and
    ``most_popular_answer_levenshtein_distance`` is ``-1`` when the segment
    is empty.
    """
    # Use .get() rather than indexing: the segments passed in by
    # calculate_factors are defaultdict(list) instances, and indexing would
    # insert an empty entry for every probed answer. Those phantom entries
    # inflated "unique_answers_count", made it depend on iteration order and
    # kept the empty-segment (-1) branch below from ever being taken.
    answers = unique_answers.get(unique_answer, [])
    total_answers = sum(len(recs) for recs in unique_answers.values())
    most_popular_answer = max(unique_answers, key=lambda k: len(unique_answers[k])) if unique_answers else None
    factors = {
        "current_answer_count": len(answers),
        "current_answer_ratio": float(len(answers))/total_answers if total_answers != 0 else .0,
        "unique_answers_count": len(unique_answers),
        "unique_answers_count_normalized": float(len(unique_answers))/total_answers if total_answers != 0 else .0,
        "most_popular_answer_levenshtein_distance": utils.levenshtein_distance(unique_answer, most_popular_answer) if most_popular_answer is not None else -1,
        "current_answer_to_most_popular_answer_ratio": float(len(answers))/len(unique_answers[most_popular_answer]) if unique_answers.get(most_popular_answer) else .0,
        # Share of all records whose text equals the lowercased form of this
        # answer (the answer itself when it is already lowercase).
        "current_answer_lowercased_ratio": float(len(unique_answers.get(unique_answer.lower(), [])))/total_answers if total_answers else .0,
        "total_answers": total_answers,
    }
    return factors

def calculate_factors(unique_answers, ocr_data, word_lists=[]):
    """Yield a ``(unique_answer, factors)`` pair for every distinct answer.

    Parameters:
        unique_answers: non-empty mapping from answer text (``unicode``) to
            the list of answer records that gave that text. Each record is
            read for ``"position"`` (``"left"`` or ``"right"``), ``"timer"``,
            ``"timestamp"`` (``YYYY-MM-DD HH:MM:SS``) and, through
            ``checkup_segments``, ``"levenshtein_distance"`` and
            ``"checkup"``.
        ocr_data: mapping providing ``"ocr_answer"`` (``unicode``),
            ``"ocr_confidence"`` and ``"ocr_language"`` (a key of
            ``LANGUAGE_IDS``).
        word_lists: word lists shaped like the result of ``load_word_list``;
            the argument is only iterated, never modified.

    Yields:
        ``(unique_answer, factors)`` where ``factors`` maps factor names to
        numbers; the names are the ones listed by ``factor_names(word_lists)``.

    This is a generator, so nothing (including the assertions) runs before
    the first item is requested. Ratios of response times whose denominator
    is zero are reported as ``.0``.
    """
    def ratio_or_zero(numerator, denominator):
        # An undefined ratio is reported as .0, the same convention the
        # segmented ratios use, instead of raising ZeroDivisionError (which
        # happened here when every timer of an answer was 0).
        return numerator/denominator if denominator != 0 else .0

    assert unique_answers

    assert isinstance(ocr_data["ocr_answer"], unicode)

    # Regroup the records as segment -> answer text -> records, so that the
    # segmented factors can be computed over each checkup segment separately.
    segmented_answers = defaultdict(lambda: defaultdict(list))
    for unique_answer, answers in unique_answers.iteritems():
        for answer in answers:
            for seg in checkup_segments(answer):
                segmented_answers[seg][unique_answer].append(answer)
    # The None segment is the unsegmented set of all answers.
    segmented_answers[None] = unique_answers

    for unique_answer, answers in unique_answers.iteritems():
        assert isinstance(unique_answer, unicode)
        position_count = {"left": 0, "right": 0}
        for answer in answers:
            position_count[answer["position"]] += 1
        assert position_count["left"] + position_count["right"] == len(answers)

        factors = {
            "ocr_confidence": ocr_data["ocr_confidence"],
            "ocr_language": LANGUAGE_IDS[ocr_data["ocr_language"]],
            "has_latin_characters": int(any(c in string.ascii_letters for c in unique_answer)),
            "has_cyrillic_characters": int(any(c in CYRILLIC_LETTERS for c in unique_answer)),
            "has_capital_characters": int(has_capitals(unique_answer)),
            "has_digits": int(has_digits(unique_answer)),
            "has_punctuation": int(has_punctuation(unique_answer)),
            "position_left_ratio": float(position_count["left"])/len(answers),
            "position_right_ratio": float(position_count["right"])/len(answers),
            "position_difference_ratio": float(position_count["left"]-position_count["right"])/(position_count["left"]+position_count["right"]),
            "ocr_answer_levenshtein_distance": utils.levenshtein_distance(unique_answer, ocr_data["ocr_answer"]),
            "ocr_answer_length_difference": len(unique_answer) - len(ocr_data["ocr_answer"]),
        }
        factors["position_difference_abs_ratio"] = abs(factors["position_difference_ratio"])

        # Response time statistics (calculate_stats returns floats, so the
        # divisions below are true divisions).
        factors.update(calculate_stats((a["timer"] for a in answers), prefix="response_time"))
        factors["response_time_min_normalized_to_max"] = ratio_or_zero(factors["response_time_min"], factors["response_time_max"])
        factors["response_time_min_normalized_to_avg"] = ratio_or_zero(factors["response_time_min"], factors["response_time_avg"])
        # The +1 keeps the denominator non-zero for an empty answer text.
        factors["response_time_avg_normalized_to_current_answer_length"] = factors["response_time_avg"]/(len(unique_answer)+1)
        factors["response_time_perc50_normalized_to_current_answer_length"] = factors["response_time_perc50"]/(len(unique_answer)+1)

        # Time-of-day statistics; min and max are not exported as factors.
        time_of_day_stats = calculate_stats((seconds_since_midnight(a["timestamp"]) for a in answers), prefix="time_of_day")
        del time_of_day_stats["time_of_day_min"]
        del time_of_day_stats["time_of_day_max"]
        factors.update(time_of_day_stats)

        # Word list membership: 1 when the answer equals a word variant that
        # was produced with exactly this (skip-left, skip-right) pair.
        for wordlist in word_lists:
            for i in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1):
                for j in xrange(WORDLIST_SKIP_CHARACTERS_MAX+1):
                    fname = "in_%s_wordlist_skipleft_%d_skipright_%d"%(wordlist["metadata"]["id"], i, j)
                    factors[fname] = int((i,j) in wordlist["words"].get(unique_answer, []))

        # CHECKUP_SEGMENTS starts with None, so the unsegmented
        # "current_answer_count" is already in place when the per-segment
        # ratios below divide by it.
        for seg in CHECKUP_SEGMENTS:
            segmented_unique_answers = segmented_answers[seg]
            segmented_factors = calculate_segmented_factors(unique_answer, segmented_unique_answers, ocr_data, word_lists)
            for name, value in segmented_factors.iteritems():
                name = segmented_factor_name(name, seg)
                factors[name] = value

            if seg is not None:
                current_answer_count_name = segmented_factor_name("current_answer_count", seg)
                factors["current_answer_%s_ratio"%seg] = float(factors[current_answer_count_name])/factors["current_answer_count"]

        yield unique_answer, factors
