"""
    Get production queries from eventlogdata on hahn
"""
import argparse
import re
import sys
import time

from yql.api.v1.client import YqlClient


def sslfix():  # YTADMINREQ-8845
    """Append the Yandex internal root CA to the certifi CA bundle.

    Downloads the certificate and appends it to the file reported by
    ``certifi.where()``. The bundle is left untouched when it already
    contains the downloaded certificate, so repeated runs do not keep
    growing the file.

    Raises:
        requests.HTTPError: the certificate download returned an HTTP error
            status (nothing is written to the bundle in that case).
    """
    import certifi
    import requests

    YANDEX_SSL_CERTIFICATE = 'https://crls.yandex.net/YandexInternalRootCA.crt'

    ca_bundle = certifi.where()
    resp = requests.get(YANDEX_SSL_CERTIFICATE)
    # Never append an error page to the trust store.
    resp.raise_for_status()

    # Compare as bytes: the bundle may contain non-ASCII comments.
    with open(ca_bundle, 'rb') as ca_file:
        already_installed = resp.content.strip() in ca_file.read()
    if already_installed:
        return

    with open(ca_bundle, 'a') as ca_file:
        ca_file.write('\n# Yandex Internal Root CA\n')
        ca_file.write(resp.text)


# YQL template used by get_mmeta_queries (the default middlesearch type).
# It is filled in with ``str.format`` (the single ``{}`` is the day table name),
# which is why braces that belong to YQL itself are doubled (``{{ }}``).
#
# For every ``frame_id`` the query returns two columns, in this order:
#   apphost_req - third tab-separated field of an ``AppHostRequest`` event;
#   cgi         - first tab-separated field of a ``ContextCreated`` event.
# When a frame has several candidates, the longest value is taken (MAX_BY by len).
# ``ContextCreated`` events whose data contains ``&dh=``, an ``info=`` cgi
# parameter or ``yandex_tier_search_stage`` are ignored.
QUERY_MMETA = """
PRAGMA yt.InferSchema;
PRAGMA RegexUseRe2='true';


$is_context_created = ($event_type, $event_data) -> {{
    return
        ($event_type REGEXP "ContextCreated[\\n]{{0,1}}") AND
        NOT($event_data REGEXP ".*&dh=.*") AND
        NOT($event_data REGEXP ".*[&\\?]info=.*") AND
        NOT (String::Contains($event_data, "yandex_tier_search_stage"));
}};

$is_apphost_req = ($event_type) -> {{
    return $event_type REGEXP "AppHostRequest[\\n]{{0,1}}"
}};

$get_context_created = ($event_type, $event_data) -> {{
    return IF ($is_context_created($event_type, $event_data),  String::SplitToList($event_data, "\\t")[0], "")
}};


$get_apphost_req = ($event_type, $event_data) -> {{
    return IF ($is_apphost_req($event_type),  String::SplitToList($event_data, "\\t")[2], "")
}};

SELECT
    MAX_BY($get_apphost_req(event_type, event_data), len($get_apphost_req(event_type, event_data))) as apphost_req,
    MAX_BY($get_context_created(event_type, event_data), len($get_context_created(event_type, event_data))) as cgi
FROM hahn.`//home/eventlogdata/MsuseardataJupiterTier0/{}`
WHERE $is_context_created(event_type, event_data) or $is_apphost_req(event_type)
GROUP BY frame_id
"""

# YQL template used by get_itditp_queries (``--mtype itditp``).
# It is filled in with ``str.format`` (the single ``{}`` is the day table name),
# which is why braces that belong to YQL itself are doubled (``{{ }}``).
#
# Same shape as QUERY_MMETA: per ``frame_id`` it returns ``apphost_req`` (third
# tab-separated field of an ``AppHostRequest`` event) and ``cgi`` (first
# tab-separated field of a ``ContextCreated`` event), longest value wins.
# Differences from QUERY_MMETA visible here: it reads the
# ``MsuseardataJupiterTier0_itditp`` table and does not exclude events
# containing ``yandex_tier_search_stage``.
QUERY_ITDITP = """
PRAGMA yt.InferSchema;
PRAGMA RegexUseRe2='true';


$is_context_created = ($event_type, $event_data) -> {{
    return
        ($event_type REGEXP "ContextCreated[\\n]{{0,1}}") AND
        NOT($event_data REGEXP ".*&dh=.*") AND
        NOT($event_data REGEXP ".*[&\\?]info=.*");
}};

$is_apphost_req = ($event_type) -> {{
    return $event_type REGEXP "AppHostRequest[\\n]{{0,1}}"
}};

$get_context_created = ($event_type, $event_data) -> {{
    return IF ($is_context_created($event_type, $event_data),  String::SplitToList($event_data, "\\t")[0], "")
}};


$get_apphost_req = ($event_type, $event_data) -> {{
    return IF ($is_apphost_req($event_type),  String::SplitToList($event_data, "\\t")[2], "")
}};

SELECT
    MAX_BY($get_apphost_req(event_type, event_data), len($get_apphost_req(event_type, event_data))) as apphost_req,
    MAX_BY($get_context_created(event_type, event_data), len($get_context_created(event_type, event_data))) as cgi
FROM hahn.`//home/eventlogdata/MsuseardataJupiterTier0_itditp/{}`
WHERE $is_context_created(event_type, event_data) or $is_apphost_req(event_type)
GROUP BY frame_id
"""

# YQL template used by get_int_queries (``--mtype int``).
# Filled in with ``str.format``; the single ``{}`` is the day table name.
# Selects ``event_data`` of ``ContextCreated`` events from the WebTier0 log,
# skipping events whose data contains ``&dh=`` or an ``info=`` cgi parameter.
# The backslash before ``?`` is written as ``\\`` so the literal holds no
# invalid Python escape sequence; the resulting text still contains ``\?``.
QUERY_INT = """
PRAGMA yt.InferSchema;
PRAGMA RegexUseRe2='true';

SELECT event_data FROM `home/eventlogdata/WebTier0/{}`
WHERE event_type == "ContextCreated" AND NOT(event_data REGEXP ".*&dh=.*") AND NOT(event_data REGEXP ".*[&\\?]info=.*")
"""


class QueryFilters(object):
    """
        Stateful checks applied to cgi query strings while dumping.

        Every check returns a bool and keeps counters that are reported
        once the dump is finished.
    """
    # Upper bound for the empty-queries rate, in per cent.
    EMPTY_THRESHOLD_PER_CENT = 10
    # Matches the whole "&reqid=<value>" cgi parameter.
    REQID_RE = re.compile(r"&reqid=([^&\s]*)")
    # Matches the "&pron=db_timestamp..." cgi parameter.
    DB_TS_RE = re.compile(r'&pron=db_timestamp[^&\s]+')

    def __init__(self):
        """Start with no known reqids and zeroed counters."""
        self.req_ids = set()
        self.not_unique_counter = 0
        self.empty_counter = 0

    def is_reqid_unique(self, query):
        """Return True when the query carries a reqid that was not seen before.

        A query without a reqid parameter yields False and is not counted;
        a repeated reqid yields False and bumps ``not_unique_counter``.
        """
        found = self.REQID_RE.search(query)
        if found is None:
            return False
        reqid_param = found.group(0)
        if reqid_param in self.req_ids:
            self.not_unique_counter += 1
            return False
        self.req_ids.add(reqid_param)
        return True

    def is_empty_text(self, query):
        """Return True (and count it) when the query has no "&text=" parameter."""
        if "&text=" in query:
            return False
        self.empty_counter += 1
        return True

    def is_empty_text_or_qtree(self, query):
        """SEARCH-3003

        Return True (and count it) when "&text=" or "&qtree=" is missing.
        """
        if "&text=" in query and "&qtree=" in query:
            return False
        self.empty_counter += 1
        return True


def get_int_queries(opts):
    """Dump cgi queries for the "int" middlesearch type into ``opts.cgipath``.

    Runs QUERY_INT, takes the first tab-separated field of every returned
    ``event_data`` value, drops queries without "&text=" and (when
    ``opts.unique`` is set) queries with an already seen or missing reqid,
    strips the "&pron=db_timestamp..." parameter and writes the rest one per
    line, prefixed with "?".

    At most ``opts.limit`` queries are written when the limit is non-zero,
    the same way the mmeta and itditp dumps honour it.

    Raises:
        RuntimeError: more than 10 rows have fewer than two tab-separated
            fields, or the final checks in _finalize_selection fail.
    """
    request = _run_yql_query(QUERY_INT, opts)

    correct_queries_counter = 0
    allowed_bad_rows = 10
    bad_rows = 0
    q_filters = QueryFilters()
    with open(opts.cgipath, "w") as f:
        for table in request.get_results():
            # Nothing more to write: do not download further result tables.
            if opts.limit and correct_queries_counter >= opts.limit:
                break
            _fetch_full_data(table)
            for row in table.rows:
                if opts.limit and correct_queries_counter >= opts.limit:
                    break
                if not row:
                    continue
                stripped_row = row[0].strip()
                if not stripped_row:
                    continue
                splitted_row = stripped_row.split("\t")
                if len(splitted_row) < 2:
                    # A well-formed event has at least two tab-separated fields.
                    bad_rows += 1
                    print("Bad row: '{}', skip it".format(stripped_row))
                    if bad_rows > allowed_bad_rows:
                        raise RuntimeError("Too many bad rows")
                    continue
                query = splitted_row[0]
                if q_filters.is_empty_text(query):
                    print >> sys.stderr, "Empty query:"
                    print >> sys.stderr, query
                    continue
                if opts.unique and not q_filters.is_reqid_unique(query):
                    continue
                correct_queries_counter += 1
                f.write("?" + q_filters.DB_TS_RE.sub('', query) + "\n")

    _finalize_selection(correct_queries_counter, opts.limit, q_filters)


def _run_yql_query(query_template, opts):
    """Build the final YQL text from a template, start it and return the request.

    The template is formatted with ``opts.day``. When ``opts.limit`` is set
    a ``LIMIT`` of twice that value is appended (otherwise just ";"), and
    when ``opts.sbtaskid`` is set a trailing comment with the Sandbox task
    id is added. The query text is printed before it is run.
    """
    client = YqlClient(db='hahn', token=opts.token)

    limit_clause = ';'
    if opts.limit:
        limit_clause = '\nLIMIT {};'.format(opts.limit * 2)

    task_note = ''
    if opts.sbtaskid:
        task_note = '\n-- Sandbox Task {}'.format(opts.sbtaskid)

    yql_req = query_template.format(opts.day) + limit_clause + task_note
    print("Use YQL query:")
    print(yql_req)

    request = client.query(yql_req, syntax_version=1)
    request.run()
    return request


def get_mmeta_queries(opts):
    """Dump paired apphost and cgi queries for the default (mmeta) type.

    Runs QUERY_MMETA and, for every accepted row, writes "?" + cgi to
    ``opts.cgipath`` and the stripped apphost request to
    ``opts.apphostpath``. Rows whose cgi lacks "&text=" or "&qtree=" are
    reported to stderr and skipped; with ``opts.unique`` set, rows with a
    repeated or missing reqid are skipped too. A non-zero ``opts.limit``
    caps the number of written rows.
    """
    request = _run_yql_query(QUERY_MMETA, opts)
    filters = QueryFilters()
    dumped = 0
    with open(opts.apphostpath, "w") as apphost_out, open(opts.cgipath, "w") as cgi_out:
        for table in request.get_results():
            _fetch_full_data(table)
            for row in table.rows:
                if opts.limit and dumped >= opts.limit:
                    break
                if not row:
                    continue
                apphost_req, cgi_req = row
                if filters.is_empty_text_or_qtree(cgi_req):
                    print >> sys.stderr, "Empty query:"
                    print >> sys.stderr, cgi_req
                    continue
                duplicate = opts.unique and not filters.is_reqid_unique(cgi_req)
                if duplicate:
                    continue
                dumped += 1
                cgi_out.write("?" + cgi_req + "\n")
                apphost_out.write(apphost_req.strip() + "\n")

    _finalize_selection(dumped, opts.limit, filters)


def get_itditp_queries(opts):
    """Dump paired apphost and cgi queries for the "itditp" type.

    Runs QUERY_ITDITP and, for every accepted row, writes "?" + stripped
    cgi to ``opts.cgipath`` and the stripped apphost request to
    ``opts.apphostpath``. Rows whose cgi lacks "&text=" are reported to
    stderr and skipped; with ``opts.unique`` set, rows with a repeated or
    missing reqid are skipped too. A non-zero ``opts.limit`` caps the
    number of written rows.
    """
    request = _run_yql_query(QUERY_ITDITP, opts)
    filters = QueryFilters()
    dumped = 0
    with open(opts.apphostpath, "w") as apphost_out, open(opts.cgipath, "w") as cgi_out:
        for table in request.get_results():
            _fetch_full_data(table)
            for row in table.rows:
                if opts.limit and dumped >= opts.limit:
                    break
                if not row:
                    continue
                apphost_req, cgi_req = row
                if filters.is_empty_text(cgi_req):
                    print >> sys.stderr, "Empty query:"
                    print >> sys.stderr, cgi_req
                    continue
                duplicate = opts.unique and not filters.is_reqid_unique(cgi_req)
                if duplicate:
                    continue
                dumped += 1
                cgi_out.write("?{}\n".format(cgi_req.strip()))
                apphost_out.write(apphost_req.strip() + "\n")

    _finalize_selection(dumped, opts.limit, filters)


def _fetch_full_data(table):
    n_try = 0
    for n_try in range(1, 4):
        try:
            table.fetch_full_data()
            break
        except Exception as e:
            print "Failed to fetch full data {} time".format(n_try)
            print e
            time.sleep(20)
    if n_try >= 3:
        raise RuntimeError("Can't fetch full data from hahn")


def _finalize_selection(correct_queries_counter, limit, q_filters):
    print "{} queries dumped".format(correct_queries_counter)
    if correct_queries_counter < limit:
        raise RuntimeError("Not enough queries dumped: {}. Expected: {}".format(correct_queries_counter, limit))
    if q_filters.empty_counter + q_filters.not_unique_counter > 0:
        print "Skip {} not unique queries and {} queries with empty text or qtree".format(
            q_filters.not_unique_counter, q_filters.empty_counter
        )
    empty_rate = 100.0 * q_filters.empty_counter / correct_queries_counter
    if empty_rate > q_filters.EMPTY_THRESHOLD_PER_CENT:
        raise RuntimeError("Too much empty queries: {}%".format(empty_rate))


def main():
    """Parse command-line options and run the dump for the chosen type.

    ``--mtype int`` and ``--mtype itditp`` select their dedicated dumps;
    any other value (including none) falls back to the mmeta dump.
    """
    parser = argparse.ArgumentParser(description="Get production queries")
    parser.add_argument("--day", "-d", help="Day of eventlog data")
    parser.add_argument("--mtype", "-m", help="Middlesearch type")
    parser.add_argument("--limit", "-l", type=int, help="Limit of queries (0 - dump all)", default=0)
    parser.add_argument("--cgipath", help="Path to cgi output queries")
    parser.add_argument("--apphostpath", help="Path to apphost output queries")
    parser.add_argument("--token", "-t", help="Token")
    parser.add_argument("--unique", "-u", help="Unique reqids only", action="store_true", default=True)
    parser.add_argument("--sbtaskid", help="Sandbox task id")
    opts = parser.parse_args()

    sslfix()

    dumpers = {
        "int": get_int_queries,
        "itditp": get_itditp_queries,
    }
    dump = dumpers.get(opts.mtype, get_mmeta_queries)
    dump(opts)


if __name__ == '__main__':
    main()
