import argparse
import json
import logging
import datetime
import sys

import yt.wrapper as yt

__author__ = 'irlab'


class BeastSample:
    """Collect one SERP HTML document per unique query from daily beast tables.

    The input tables are located via ``BEAST_TABLE_TEMPLATE`` (one table per
    date).  A YT map-reduce job keeps only records of ``BEAST_FILTER_COUNTRY``,
    extracts the HTML of the first SERP resource and writes a single
    ``key``/``subkey``/``value`` row per query to the output table.
    """

    # strftime/strptime pattern used both for CLI dates and for table paths.
    DATE_FORMAT = '%Y%m%d'
    # Path of a daily input table; ``{date}`` is formatted with DATE_FORMAT.
    BEAST_TABLE_TEMPLATE = '//home/freshness/beast/{date}/web_touch/google/json'
    # Only records whose ``country`` field equals this value are kept.
    BEAST_FILTER_COUNTRY = 'ru'

    def __init__(self, do_recalculate, out_html_table):
        """Store the job settings.

        :param do_recalculate: when true, rebuild the output table even if it
            already exists.
        :param out_html_table: YT path of the output table.
        """
        self.do_recalculate = do_recalculate
        self.out_html_table = out_html_table

    def beast_extract_mapper(self, rec):
        """Map an input record to a ``query``/``timestamp``/``html`` record.

        Records whose ``country`` differs from ``BEAST_FILTER_COUNTRY`` produce
        no output.  The ``answer`` field is parsed as JSON and the ``content``
        of the first entry of ``serp-page -> serp-resources -> resources`` is
        emitted, UTF-8 encoded.

        :param rec: input row with ``country``, ``query``, ``answer`` and
            ``timestamp`` fields.
        :raises KeyError, IndexError, ValueError: if the record or the parsed
            answer does not have the expected structure.
        """
        if rec['country'] != self.BEAST_FILTER_COUNTRY:
            return
        answer = json.loads(rec['answer'])
        html = answer['serp-page']['serp-resources']['resources'][0]['content']
        yield dict(
            query=rec['query'],
            timestamp=str(rec['timestamp']),
            html=html.encode('utf8'),
        )

    def html_sample_reducer(self, key, recs):
        """Emit a single output row for the group of records sharing a query.

        Only the first record of the group (in the order it is delivered) is
        used; the remaining records are ignored.

        :param key: reduce key of the group (unused, the query is read from
            the record itself).
        :param recs: iterable of records produced by ``beast_extract_mapper``.
        """
        for rec in recs:
            yield dict(key=rec['query'], subkey=rec['timestamp'], value=rec['html'])
            # One sample per query is enough, skip the rest of the group.
            break

    def get_dates_to_process(self, last_date, days_back):
        """Return ``days_back`` consecutive dates ending at ``last_date``.

        :param last_date: last day of the period, formatted as DATE_FORMAT.
        :param days_back: number of days in the period.
        :return: list of DATE_FORMAT strings, newest first; empty when
            ``days_back`` is not positive.
        :raises ValueError: if ``last_date`` does not match DATE_FORMAT.
        """
        last_day = datetime.datetime.strptime(last_date, self.DATE_FORMAT)
        return [
            (last_day - datetime.timedelta(days=offset)).strftime(self.DATE_FORMAT)
            for offset in range(days_back)
        ]

    def run_filter_beast(self, beast_table_list, out_html_table):
        """Run the map-reduce job over the input tables.

        :param beast_table_list: list of YT paths of the input tables.
        :param out_html_table: YT path of the destination table; it is created
            (together with missing parent nodes) if it does not exist yet.
        """
        yt.create_table(out_html_table, recursive=True, ignore_existing=True)
        yt.run_map_reduce(
            mapper=self.beast_extract_mapper,
            reducer=self.html_sample_reducer,
            reduce_by=['query'],
            source_table=beast_table_list,
            destination_table=out_html_table,
            spec=dict(title=self.__class__.__name__)
        )

    def main(self, last_date, days_back):
        """Find the existing input tables for the period and build the output.

        The job is skipped when the output table already exists, unless
        ``do_recalculate`` is set.

        :param last_date: last day of the period, formatted as DATE_FORMAT.
        :param days_back: number of days in the period.
        :raises Exception: if none of the input tables for the period exists.
        """
        dates_to_process = self.get_dates_to_process(last_date, days_back)
        beast_table_list = []
        for date in sorted(dates_to_process):
            beast_table = self.BEAST_TABLE_TEMPLATE.format(date=date)
            # Single-argument print(...) gives the same output under the
            # Python 2 print statement and the Python 3 print function.
            if yt.exists(beast_table):
                beast_table_list.append(beast_table)
                print('use input table ' + beast_table)
            else:
                print('table does not exists ' + beast_table)
        if not beast_table_list:
            raise Exception('beast_table_list is empty, check ' + self.BEAST_TABLE_TEMPLATE)

        if not yt.exists(self.out_html_table) or self.do_recalculate:
            print('getting html ' + repr(beast_table_list) + '->' + self.out_html_table)
            # Use the instance attribute: relying on a module-level
            # ``out_html_table`` breaks when the class is used outside of the
            # script entry point.
            self.run_filter_beast(beast_table_list, self.out_html_table)

def get_yesterday():
    """Return yesterday's date (local time) formatted with BeastSample.DATE_FORMAT."""
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    return yesterday.strftime(BeastSample.DATE_FORMAT)


if __name__ == '__main__':
    # Command-line entry point: parse the options, configure logging and run
    # the sampling job for the requested period.
    parser = argparse.ArgumentParser(description='get html for unique queries from beast tables')
    parser.add_argument('--days_back', type=int, default=1, help='number of days to process, default=1')
    parser.add_argument('--date', default=get_yesterday(), help='last day of the period, default=yesterday')
    parser.add_argument('--out_html_table', default='//tmp/search-functionality/serp_anatomy/{date}_html', help='output table')
    parser.add_argument("--do_recalculate", default=False, action='store_true', help="do not skip existing tables")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        # %(message)s is the fully formatted record text; %(msg)s would print
        # the raw format string without its arguments merged in.
        format='%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        stream=sys.stdout,
    )

    # The output path may contain a {date} placeholder, filled with --date.
    out_html_table = args.out_html_table.format(date=args.date)
    BeastSample(args.do_recalculate, out_html_table).main(args.date, args.days_back)
