import urllib
import pandas as pd
import codecs
import multiprocessing
from bs4 import BeautifulSoup
import argparse
import numpy as np


def argument_parser():
    """Parse the command-line options of the script.

    Every option is mandatory:

    * ``-i``   -> ``input``    (str) filepath to tsv with htmls
    * ``-cpu`` -> ``pool``     (int) number of pools for multiprocessing
    * ``-p``   -> ``platform`` (str) desktop or touch
    * ``-o``   -> ``out_df``   (str) output df filepath

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flag, destination attribute, value type, help text)
    option_specs = (
        ('-i', 'input', str, 'filepath to tsv with htmls'),
        ('-cpu', 'pool', int, 'number of pools for multiprocessing'),
        ('-p', 'platform', str, 'desktop or touch'),
        ('-o', 'out_df', str, 'output df filepath'),
    )

    cli = argparse.ArgumentParser(description='Get parameters')
    for flag, target, value_type, help_text in option_specs:
        cli.add_argument(flag, dest=target, type=value_type, required=True, help=help_text)
    return cli.parse_args()


# Wizard detection rules for desktop pages.
# Each value is a pair of CSS class-name lists: the first list holds classes
# that identify the wizard, the second holds classes that veto the match
# (see check_class).
class_dict_desktop = {
    'yandex_bno': (['t-construct-adapter__bno'], []),
    'yandex_video': (['t-construct-adapter__videowiz'], []),
    'yandex_geo': (['t-construct-adapter__companies'], []),
    'yandex_entity': (['t-construct-adapter__entity-card'], []),
    'yandex_images': (['t-construct-adapter__images'], []),
    'yandex_fact': (
        [
            't-construct-adapter__suggest-fact',
            't-construct-adapter__entity-fact',
            't-construct-adapter__math',
            't-construct-adapter__drugs',
            't-construct-adapter__colors',
            't-construct-adapter__fact-instruction',
            't-construct-adapter__converter',
            't-construct-adapter__time',
        ],
        [],
    ),
    'google_bno': (['mslg'], []),
    'google_video': (['twQ0Be', 'BFJZOc', 'COEoid'], []),
    'google_geo': (['xERobd', 'luibr'], []),
    'google_entity': (
        ['farUxc', 'KsUr1', 'TieM1d'],
        ['luibr', 'twQ0Be', 'HwtpBd'],
    ),
    'google_images': (['rg_r', 'rKSBKd'], []),
    'google_fact': (
        ['qtR3Y', 'vk_ans', 'c2xzTb', 'kpd-ans', 'rYczAc'],
        [],
    ),
}


# Wizard detection rules for touch pages.
# Same layout as the desktop table: wizard key -> (identifying classes,
# vetoing classes), both given as lists of CSS class names (see check_class).
class_dict_touch = {
    'yandex_bno': (['t-construct-adapter__bno'], []),
    'yandex_video': (['t-construct-adapter__videowiz'], []),
    'yandex_geo': (['t-construct-adapter__companies'], []),
    'yandex_entity': (['t-construct-adapter__entity-card'], []),
    'yandex_images': (['t-construct-adapter__images'], []),
    'yandex_fact': (
        [
            't-construct-adapter__suggest-fact',
            't-construct-adapter__entity-fact',
            't-construct-adapter__math',
            't-construct-adapter__drugs',
            't-construct-adapter__colors',
            't-construct-adapter__fact-instruction',
            't-construct-adapter__converter',
            't-construct-adapter__time',
        ],
        [],
    ),
    'google_bno': (['Lgnr0e'], []),
    'google_video': (
        ['kGH5dd', 'twQ0Be', 'HfB6re', 'XqIXXe'],
        [],
    ),
    'google_geo': (['QMle0e', 'qIX5B'], []),
    'google_entity': (
        ['farUxc', 'KsUr1'],
        ['qIX5B', 'twQ0Be', 'HwtpBd', 'QMle0e'],
    ),
    'google_images': (['bUNBRd', 'rKSBKd'], []),
    'google_fact': (
        ['qtR3Y', 'vk_ans', 'c2xzTb', 'kpd-ans', 'HwtpBd', 'CR33Se'],
        ['kno-kp'],
    ),
}


def find_google_elem(elem_list, elem_id):
    """Yield the outermost descendants of ``elem_list`` carrying ``elem_id``.

    The tree is walked depth-first in the order of ``children``.  An element
    that has the attribute is yielded and its subtree is not searched any
    further; an element without it is searched recursively.

    Children that do not expose ``has_attr`` (e.g. text nodes in a parsed
    HTML tree) are skipped.  The previous implementation emitted a ``None``
    placeholder for each of them through a bare ``except``; those
    placeholders reached the top-level callers and made ``check_class`` fail
    on ``None.attrs``.  The bare ``except`` also intercepted
    ``GeneratorExit``.

    Args:
        elem_list: node exposing a ``children`` iterable.
        elem_id: name of the attribute to look for (e.g. ``'data-hveid'``).

    Yields:
        Matching elements only; ``None`` is never yielded.
    """
    for child in elem_list.children:
        # Non-element nodes have no has_attr; getattr's default turns the
        # AttributeError into a cheap "skip this child" signal.
        has_attr = getattr(child, 'has_attr', None)
        if has_attr is None:
            continue
        if has_attr(elem_id):
            yield child
        else:
            # No "yield from": the file targets Python 2.
            for nested in find_google_elem(child, elem_id):
                yield nested


def check_class(class_keys, serp_item):
    """Decide whether ``serp_item`` satisfies a wizard's CSS-class rule.

    A class counts as present when it is listed in the item's own ``class``
    attribute or when ``serp_item.find_all`` finds an element with it.

    Args:
        class_keys: pair ``(required, forbidden)`` of class-name lists.
        serp_item: element exposing ``attrs`` and ``find_all``.

    Returns:
        True when at least one required class is present and no forbidden
        class is; False otherwise (including an empty required list).
    """
    def carries(css_class):
        own_classes = serp_item.attrs.get('class', ())
        if css_class in own_classes:
            return True
        return len(serp_item.find_all(attrs={'class': css_class})) > 0

    required, forbidden = class_keys[0], class_keys[1]
    if not any(carries(name) for name in required):
        return False
    return not any(carries(name) for name in forbidden)


def find_tags_desktop(html_url, class_dict):
    """Locate wizards on a desktop results page and report their positions.

    The page is downloaded from ``html_url`` and parsed with BeautifulSoup.
    Two markups are supported:

    * ``serp-item`` elements inside the first ``content__left`` /
      ``content__right`` containers (presumably the Yandex layout);
    * when the left column yields no such items, elements carrying a
      ``data-hveid`` attribute under ``#rso`` (left) and ``#rhs`` (right).
      If ``#rso`` is absent, ``#main`` is used instead and the right column
      is treated as empty.

    Args:
        html_url: URL of the page to analyse.
        class_dict: mapping of wizard key to a ``(required, forbidden)`` pair
            of CSS class lists, as accepted by ``check_class``.

    Returns:
        dict mapping every detected wizard key to the position of the first
        matching item.  Right-column items are examined first and always
        report ``-1``; left-column items report their 0-based index.  An
        empty dict is returned (after printing ``error: <url>``) when the
        page cannot be fetched or parsed.
    """
    try:
        # urllib.urlopen is the Python 2 API; close the handle explicitly so
        # worker processes do not accumulate open connections.
        response = urllib.urlopen(html_url)
        try:
            parsed_html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(parsed_html, 'html.parser')
        tags_list = {}
        serp_items_left = []
        serp_items_right = []
        content_left = soup.find_all(attrs={'class': 'content__left'})
        content_right = soup.find_all(attrs={'class': 'content__right'})
        if content_left:
            serp_items_left = content_left[0].find_all(attrs={'class': 'serp-item'})
        if content_right:
            serp_items_right = content_right[0].find_all(attrs={'class': 'serp-item'})

        if not serp_items_left:
            content_left = soup.find_all(attrs={'id': 'rso'})
            content_right = soup.find_all(attrs={'id': 'rhs'})
            if not content_left:
                # find_all returns an empty list rather than raising, so the
                # fallback has to be an explicit emptiness check (the former
                # try/except around find_all could never trigger).
                content_left = soup.find_all(attrs={'id': 'main'})
                content_right = []

            # Drop any None placeholder the element walker may emit for
            # non-element nodes; check_class cannot handle them.
            serp_items_left = [
                item for item in find_google_elem(content_left[0], 'data-hveid')
                if item is not None
            ]
            serp_items_right = []
            if content_right:
                serp_items_right = [
                    item for item in find_google_elem(content_right[0], 'data-hveid')
                    if item is not None
                ]

        # Right-column items come first and all share position -1.
        all_serp_items = [(-1, x) for x in serp_items_right] + list(enumerate(serp_items_left))
        for key in class_dict:
            for n, serp_item in all_serp_items:
                if check_class(class_dict[key], serp_item):
                    tags_list[key] = n
                    break
        return tags_list
    except Exception:
        # Best effort: one broken page must not abort the whole batch.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        print('error: {}'.format(html_url))
        return {}


def find_tags_touch(html_url, class_dict):
    """Locate wizards on a touch results page and report their positions.

    The page is downloaded from ``html_url`` and parsed with BeautifulSoup.
    Items are the ``serp-item`` elements of the page; when there are none,
    items are the elements carrying a ``data-hveid`` attribute under ``#rso``
    or, if ``#rso`` is absent, under ``#main``.

    Args:
        html_url: URL of the page to analyse.
        class_dict: mapping of wizard key to a ``(required, forbidden)`` pair
            of CSS class lists, as accepted by ``check_class``.

    Returns:
        dict mapping every detected wizard key to the 0-based index of the
        first matching item.  An empty dict is returned (after printing
        ``error: <url>``) when the page cannot be fetched or parsed.
    """
    try:
        # urllib.urlopen is the Python 2 API; close the handle explicitly so
        # worker processes do not accumulate open connections.
        response = urllib.urlopen(html_url)
        try:
            parsed_html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(parsed_html, 'html.parser')
        tags_list = {}
        serp_items = soup.find_all(attrs={'class': 'serp-item'})
        if not serp_items:
            # find_all returns an empty (falsy) list when nothing matches, so
            # "or" selects the #main fallback without a try/except.
            containers = (soup.find_all(attrs={'id': 'rso'})
                          or soup.find_all(attrs={'id': 'main'}))
            # Drop any None placeholder the element walker may emit for
            # non-element nodes; check_class cannot handle them.
            serp_items = [
                item for item in find_google_elem(containers[0], 'data-hveid')
                if item is not None
            ]

        for key in class_dict:
            for n, serp_item in enumerate(serp_items):
                if check_class(class_dict[key], serp_item):
                    tags_list[key] = n
                    break
        return tags_list
    except Exception:
        # Best effort: one broken page must not abort the whole batch.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        print('error: {}'.format(html_url))
        return {}


def worker_desktop(query):
    """Build one tab-separated output line for a desktop query.

    Args:
        query: an ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``; the row must provide ``sbs_ticket``,
            ``query``, ``yandex_html`` and ``google_html``.

    Returns:
        Unicode string with the four input fields followed by one field per
        key of ``class_dict_desktop``: the position found on the Yandex page
        if any, otherwise the one found on the Google page, otherwise the
        text ``None``.
    """
    record = query[1]
    fields = [
        str(record['sbs_ticket']),
        record['query'],
        record['yandex_html'],
        record['google_html'],
    ]

    yandex_found = find_tags_desktop(record['yandex_html'], class_dict_desktop)
    google_found = find_tags_desktop(record['google_html'], class_dict_desktop)

    for wizard in class_dict_desktop:
        if wizard in yandex_found:
            position = yandex_found[wizard]
        elif wizard in google_found:
            position = google_found[wizard]
        else:
            position = None
        fields.append(str(position))

    return u'\t'.join(fields)


def worker_touch(query):
    """Build one tab-separated output line for a touch query.

    Args:
        query: an ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``; the row must provide ``sbs_ticket``,
            ``query``, ``yandex_html`` and ``google_html``.

    Returns:
        Unicode string with the four input fields followed by one field per
        key of ``class_dict_touch``: the position found on the Yandex page if
        any, otherwise the one found on the Google page, otherwise the text
        ``None``.
    """
    record = query[1]
    fields = [
        str(record['sbs_ticket']),
        record['query'],
        record['yandex_html'],
        record['google_html'],
    ]

    yandex_found = find_tags_touch(record['yandex_html'], class_dict_touch)
    google_found = find_tags_touch(record['google_html'], class_dict_touch)

    for wizard in class_dict_touch:
        if wizard in yandex_found:
            position = yandex_found[wizard]
        elif wizard in google_found:
            position = google_found[wizard]
        else:
            position = None
        fields.append(str(position))

    return u'\t'.join(fields)


def main():
    """Run the whole pipeline: fetch pages, tag wizards, write result lists.

    Steps:

    1. Read the input TSV and, in a process pool, turn every row into a line
       of wizard positions written to ``args.out_df``.
    2. Re-read that file, treat a missing position as ``100`` and flag a
       wizard as shown when its position lies in ``[-1, 5]``.
    3. For each wizard type write the queries where it is shown on Yandex, on
       Google, on either, and on both, to ``results/<filter>_<wizard>``.

    Raises:
        ValueError: if ``-p`` is neither ``desktop`` nor ``touch``.
            Previously such a value silently produced a header-only output
            file and empty result lists.
    """
    import os  # local import: only needed to create the results directory

    args = argument_parser()

    platforms = {
        'desktop': (worker_desktop, class_dict_desktop),
        'touch': (worker_touch, class_dict_touch),
    }
    if args.platform not in platforms:
        raise ValueError(
            "unknown platform {!r}: expected 'desktop' or 'touch'".format(args.platform))
    worker, class_dict = platforms[args.platform]
    # Take the header from the very dict the worker iterates over, so columns
    # and values are guaranteed to line up.
    wizard_keys = list(class_dict)

    # Create the output directory before the expensive fetching starts rather
    # than failing on the first open() at the very end.
    results_dir = 'results'
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    df = pd.read_csv(args.input, delimiter="\t", encoding='utf-8')
    columns = [u'sbs_ticket', u'query', u'yandex_html', u'google_html'] + wizard_keys

    p = multiprocessing.Pool(args.pool)
    try:
        with codecs.open(args.out_df, "w", "utf-8") as f:
            f.write(u"\t".join(columns) + u"\n")
            for result in p.imap(worker, df.iterrows()):
                f.write(result + u"\n")
    finally:
        # All results have been consumed on success, so terminate() only
        # reaps the workers; on failure it stops outstanding work.
        p.terminate()
        p.join()

    df = pd.read_csv(args.out_df, delimiter='\t')
    df.replace('None', np.nan, inplace=True)
    # 100 stands for "wizard not found": it falls outside the [-1, 5] window.
    df.fillna(100, inplace=True)
    for key in wizard_keys:
        df[key] = df[key].astype(int)
        df[key + '_oh'] = df[key].between(-1, 5)

    wizard_names = ['bno', 'entity', 'video', 'geo', 'images', 'fact']
    gb = df.groupby('query')
    for wizard_name in wizard_names:
        yandex_name = 'yandex_{}_oh'.format(wizard_name)
        google_name = 'google_{}_oh'.format(wizard_name)

        yandex_filtered = gb[yandex_name].any()
        google_filtered = gb[google_name].any()
        any_filtered = yandex_filtered | google_filtered
        both_filtered = yandex_filtered & google_filtered

        # NOTE(review): the 'only_yandex' / 'only_google' lists are not
        # exclusive -- they also contain queries where both engines show the
        # wizard.  Confirm whether that is intended.
        filters = [yandex_filtered, google_filtered, any_filtered, both_filtered]
        filter_names = ['only_yandex', 'only_google', 'any', 'both']

        for myfilter, filter_name in zip(filters, filter_names):
            with open('results/{}_{}'.format(filter_name, wizard_name), 'w') as out:
                for item in myfilter.index[myfilter]:
                    out.write('{}\n'.format(item))


# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
