# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from sql import ifelse, roundTimeToSec, arrayFilter, arrayExists
from dicts import STOPWORDS

# Argument handed to roundTimeToSec() when EventTime is rounded
# (presumably a number of seconds -- confirm against sql.roundTimeToSec).
ROUNDED_TIME = 5

# CounterID of the Yandex SERP counter.
SERP_COUNTER = 731962
# Label that replaces URLDomain for hits coming from the SERP counter.
SERP = 'SERP'
# URL parameter that carries the Yandex search phrase.
SERP_QUERY_PARAM = 'text'

# Search-phrase expression: SearchPhrase when it is not empty, otherwise the
# SERP_QUERY_PARAM URL parameter for SERP-counter hits, otherwise ''.
# (Assumes ifelse(condition, then, else) renders a SQL conditional -- confirm.)
EXTRACT_QUERY = ifelse(
    'not empty(SearchPhrase)',
    'SearchPhrase',
    ifelse(
        'CounterID={0}'.format(SERP_COUNTER),
        "extractURLParameter(URL, '{0}')".format(SERP_QUERY_PARAM),
        "''",  # later it will be possible to define more complex rules
    ),
)


# Domain expression: the literal SERP label for SERP-counter hits,
# URLDomain for every other hit.
EXTRACT_URLDOMAIN = ifelse(
    'CounterID={0}'.format(SERP_COUNTER),
    "'{0}'".format(SERP),
    "URLDomain",
)

# Empty placeholder; not referenced anywhere else in the visible part of this module.
FILTERED_ARRAY = ''


# WHERE-clause fragment selecting usable hits: not flagged as a robot,
# not marked DontCountHits, and carrying a non-empty SearchPhrase.
IS_GOOD_HIT = """
    NOT IsRobot
AND
    NOT DontCountHits
AND
    NOT empty(SearchPhrase)
"""

# Base subquery over hits_all that every other template in this module builds on.
# It yields distinct (UserID, EventDate, URLDomain, SearchPhrase, EventTime) rows
# where URLDomain / SearchPhrase / EventTime are the derived expressions
# EXTRACT_URLDOMAIN / EXTRACT_QUERY / roundTimeToSec(ROUNDED_TIME).
#
# The single-brace fields are filled right here at import time. The
# double-brace fields survive this .format() call as single-brace
# placeholders that the caller must fill later:
#   sample    - value for the SAMPLE clause
#   date      - day to select, wrapped in toDate('...')
#   searchids - raw text appended to the WHERE clause
#   custom    - raw text appended to the WHERE clause
# NOTE(review): searchids/custom follow the conditions without a joining
# keyword, so they presumably must start with "AND" (or be empty) -- confirm
# with the callers.
PREPARED_HITS_ALL = """
SELECT DISTINCT
    UserID,
    EventDate,
    {extracted_urldomain} AS URLDomain,
    {extracted_query} AS SearchPhrase,
    {rounded_time} AS EventTime
FROM
    hits_all
SAMPLE {{sample}}
WHERE
    EventDate = toDate('{{date}}')
    AND
    {is_good_hit}
    {{searchids}}
    {{custom}}
""".format(
        is_good_hit=IS_GOOD_HIT,
        extracted_query=EXTRACT_QUERY,
        extracted_urldomain=EXTRACT_URLDOMAIN,
        rounded_time=roundTimeToSec(ROUNDED_TIME)
)

# Query with the main logic for building some kind of sessions
# Self-join of the prepared hits on UserID.
#   Left side:  every prepared hit, with the abstracted phrase as APhrase1.
#   Right side: one representative hit (picked with any()) per
#               (UserID, delta_id) bucket, taken from hits with a non-empty
#               SearchPhrase that pass the accept/reject conditions; its time
#               is exposed as EventTime0.
# delta_id buckets EventTime into windows of (20 * delta) / 2.
#
# PREPARED_HITS_ALL is substituted at import time. Placeholders that remain
# for a later .format() call: abstract_phrase, delta, accept_conditions,
# reject_conditions, plus those inherited from PREPARED_HITS_ALL
# (sample, date, searchids, custom).
MAIN_HITS = """
(
    SELECT
        UserID,
        {{abstract_phrase}} AS APhrase1,
        SearchPhrase,
        URLDomain,
        EventTime
    FROM
        ({prepared_hits_all})
) GLOBAL ALL INNER JOIN
(
    SELECT
        UserID,
        any(EventTime0) as EventTime0,
        any(APhrase) as APhrase,
        any(SearchPhrase) as SearchPhrase
    FROM
    (
        SELECT
            UserID,
            EventTime AS EventTime0,
            {{abstract_phrase}} AS APhrase,
            SearchPhrase,
            intDiv(toUInt32(EventTime), (20 * {{delta}}) / 2) * ((20 * {{delta}}) / 2) AS delta_id
        FROM
            ({prepared_hits_all})
        WHERE
            NOT empty(SearchPhrase)
            {{accept_conditions}}
            {{reject_conditions}}
    )
    GROUP BY
        UserID,
        delta_id
) USING UserID
""".format(prepared_hits_all=PREPARED_HITS_ALL)


def non_duplicate_indicator(*fields):
    """Build an expression that marks the first occurrence of each element.

    The array expressions are joined with commas and passed to
    ``arrayEnumerateUniq``; the surrounding ``arrayMap`` turns every
    resulting counter into the comparison ``x=1``.

    :param fields: SQL array expressions given as strings
    :return: the assembled ``arrayMap(...)`` expression as a string
    """
    joined_fields = ','.join(fields)
    return 'arrayMap(x->x=1,arrayEnumerateUniq({arrs}))'.format(arrs=joined_fields)


def clean_array(field):
    """Build the filter expression that cleans one per-session array.

    Intent: a SERP visit is kept only if the user did not visit another
    (non-SERP) host with the same search phrase; non-SERP visits are
    always kept.

    :param field: name of the array to filter; it is walked in parallel with
        ``allPhrases`` and ``allHosts`` (HITS passes allPhrases, allHosts
        and allTimes)
    :return: the result of ``arrayFilter`` -- presumably the SQL expression
        text; confirm against the ``sql`` helpers
    """
    # Phrases that were seen on any host other than SERP.
    non_serp_lambda = "(po, ho) -> ho!='{serp}'".format(serp=SERP)
    phrases_off_serp = arrayFilter(non_serp_lambda, "allPhrases", "allHosts")

    # Condition: the current phrase `p` is among the non-SERP phrases.
    seen_off_serp = arrayExists('po -> po=p', phrases_off_serp)

    # Keep an element unless it is a SERP hit whose phrase also appears off SERP.
    keep_lambda = "(x, p, h)-> h != '{serp}' or not {is_in_non_serp_set}".format(
        serp=SERP, is_in_non_serp_set=seen_off_serp)
    return arrayFilter(keep_lambda, field, 'allPhrases', 'allHosts')


# Then we group them, but in many cases there are non-unique records because the "session" center is EventTime0.
# We add a key, LastTime, that allows us to compare two "sessions":
#   if UserID and LastTime are equal (EventTime is probably not equal), then it is the same session
# Template that collects the joined hits into candidate sessions: one row per
# (UserID, EventTime0) holding parallel arrays of phrases, raw phrases, hosts
# and times, plus the latest and earliest EventTime of the group. Only hits
# within 60 * delta of EventTime0 and with a non-empty APhrase1 or
# SearchPhrase are included.
#
# Placeholders:
#   main               - FROM source (MAIN_HITS when HITS is built)
#   delta              - double-braced here, so it survives the first .format()
#   having_post_reject - double-braced here; raw text placed after GROUP BY
#                        (presumably a HAVING clause or empty -- confirm)
NON_UNIQUE_FILTERED = """
SELECT
    UserID,
    groupArray(APhrase1)      AS Phrases,
    groupArray(SearchPhrase)  AS RawPhrases,
    groupArray(URLDomain)     AS Hosts,
    groupArray(EventTime)     AS Times,
    max(EventTime)            AS LastTime,
    min(EventTime)            AS FirstTime
FROM
{main}
WHERE
    abs(EventTime0 - EventTime) < (60 * {{delta}})
    AND
    (NOT empty(APhrase1) OR NOT empty(SearchPhrase))
GROUP BY
    UserID,
    EventTime0
{{having_post_reject}}
"""

# We can select unique "sessions" by applying `any` with the keys UserID and LastTime.
# I found that this is still not a perfect solution, but in most cases duplicated sessions are dropped.
# This template will be formatted with a subquery from visits or hits.
# Two-level de-duplication of candidate sessions:
#   inner query - one row per (UserID, LastTime, Date)
#   outer query - one row per (UserID, FirstTime, Date)
# In both levels any() keeps one representative value per group. The outer
# query renames the arrays to allPhrases / allHosts / allTimes, which are the
# names clean_array() refers to.
#
# Placeholders:
#   non_unique - subquery producing the candidate sessions
#   date       - double-braced here, so it survives the first .format()
UNIQUE_FILTERED_NOT_CLEANED = """
SELECT
    UserID,
    any(Phrases)        AS allPhrases,
    any(Hosts)          AS allHosts,
    any(Times)          AS allTimes,
    toDate('{{date}}')  AS Date,
    any(LastTime)       AS LastTime,
    FirstTime
FROM
(
    SELECT
            UserID,
            any(Phrases)        AS Phrases,
            any(Hosts)          AS Hosts,
            any(Times)          AS Times,
            toDate('{{date}}')  AS Date,
            LastTime,
            any(FirstTime)      AS FirstTime
        FROM ({non_unique})
        GROUP BY
            UserID,
            LastTime,
            Date
)
GROUP BY
    UserID,
    FirstTime,
    Date

"""

# Final projection over the de-duplicated sessions: each all* array is
# replaced by its cleaned version and exposed as Phrases / Hosts / Times.
#
# Placeholders:
#   clean_phrases, clean_hosts, clean_times - cleaned array expressions
#   unique_filtered_not_cleaned             - subquery with the all* arrays
#   date                                    - double-braced, survives .format()
UNIQUE_FILTERED_CLEANED = """
SELECT
    UserID,
    {clean_phrases}     AS Phrases,
    {clean_hosts}       AS Hosts,
    {clean_times}       AS Times,
    toDate('{{date}}')  AS Date,
    LastTime            AS LastTime,
    FirstTime
FROM
  ({unique_filtered_not_cleaned})
"""


# Fully nested hits query, assembled from the inside out:
#   MAIN_HITS -> NON_UNIQUE_FILTERED -> UNIQUE_FILTERED_NOT_CLEANED
#             -> UNIQUE_FILTERED_CLEANED
# Substituted values are not re-parsed by str.format, so the single-brace
# placeholders inside the nested templates (sample, date, searchids, custom,
# abstract_phrase, delta, accept_conditions, reject_conditions,
# having_post_reject) remain for the caller to fill.
HITS = UNIQUE_FILTERED_CLEANED.format(
    unique_filtered_not_cleaned=UNIQUE_FILTERED_NOT_CLEANED.format(
            non_unique=NON_UNIQUE_FILTERED.format(
                    main=MAIN_HITS
            )),
    clean_phrases=clean_array('allPhrases'),
    clean_hosts=clean_array('allHosts'),
    clean_times=clean_array('allTimes')
)
# Constant single-row query with the same column names as HITS
# (UserID, Phrases, Hosts, Times, Date, LastTime, FirstTime), filled with
# placeholder values.
# NOTE(review): every time-like column here is a toDate(...) value; confirm
# this matches the column types of HITS wherever the two are used together.
DUMMY = """
SELECT
    toUInt64(0) as UserID,
    [''] as Phrases,
    [''] as Hosts,
    [toDate('1971-01-01')] as Times,
    toDate('1971-01-01') as Date,
    toDate('1971-01-01') as LastTime,
    toDate('1971-01-01') as FirstTime"""

# Counts the distinct (Times, UserID, Hosts, Phrases) combinations obtained
# by unrolling the three parallel arrays of the `tmp` source with ARRAY JOIN.
#
# The `tmp` placeholder is single-braced, but this template is never
# formatted on its own in this module: it is inserted into STATS as a value,
# so the placeholder is still there for the later .format() call on STATS.
COUNT_FILTERED = """
SELECT
    count()
FROM
(
    SELECT DISTINCT
        Times,
        UserID,
        Hosts,
        Phrases
    FROM {tmp}
    ARRAY JOIN
        Times,
        Hosts,
        Phrases
)
"""

# Summary statistics comparing "filtered" users (UserID present in `tmp`)
# with the rest of the prepared hits for the day: unique users, hit counts
# and several ratios between them.
#
# Counts are multiplied by 1/sample (presumably to extrapolate from the
# sampled data to the full data set -- confirm); the ratio columns multiply
# by sample again before dividing. The "+ 1" in the denominator of
# FilteredHitsDivHitsAtFiltered presumably guards against division by zero.
#
# COUNT_FILTERED and PREPARED_HITS_ALL are substituted at import time.
# Placeholders left for the caller: sample, date, tmp, plus searchids and
# custom inherited from PREPARED_HITS_ALL.
STATS = """
SELECT
    FilteredUniques + NotFilteredUniques                   AS AllUniques,
    (1/{{sample}})*uniqIf(UserID, IsFiltered)              AS FilteredUniques,
    (1/{{sample}})*uniqIf(UserID, NOT IsFiltered)          AS NotFilteredUniques,

    (1/{{sample}})*countIf(IsFiltered)                     AS HitsAtFiltered,
    (1/{{sample}})*({count_filtered_hits})                 AS FilteredHits,
    (1/{{sample}})*countIf(NOT IsFiltered)                 AS HitsAtNotFiltered,
    HitsAtFiltered + HitsAtNotFiltered                     AS AllHits,

    (FilteredHits*{{sample}}) /
    (HitsAtFiltered*{{sample}} + 1)                        AS FilteredHitsDivHitsAtFiltered,

    (FilteredHits*{{sample}}) /
    (
      HitsAtFiltered   *{{sample}} +
      HitsAtNotFiltered*{{sample}}
    )                                                      AS FilteredHitsFraction,

    (FilteredUniques*{{sample}}) /
    (
      FilteredUniques   *{{sample}} +
      NotFilteredUniques*{{sample}}
    )                                                      AS FilteredUniquesFraction,
    (HitsAtFiltered*{{sample}})/
    (AllHits   *{{sample}})                                AS HitsAtFilteredFraction,
    toDate('{{date}}')                                     AS Date
FROM
(
    SELECT
        UserID GLOBAL IN (SELECT UserID FROM {{tmp}}) as IsFiltered,
        UserID
    FROM
        ({prepared_hits_all})
    WHERE
    NOT empty(SearchPhrase)
)
""".format(
    count_filtered_hits=COUNT_FILTERED,
    prepared_hits_all=PREPARED_HITS_ALL
)

# Word frequencies over session phrases, most frequent first.
# Phrases are unrolled with ARRAY JOIN and de-duplicated per
# (UserID, phrase, LastTime); each phrase is split on single spaces, and
# words listed in STOPWORDS are excluded before counting.
#
# The stop-word list is baked in at import time; the `subquery` placeholder
# is double-braced here and therefore left for the caller to fill.
# NOTE(review): stop words are placed between single quotes verbatim, with
# no escaping -- this assumes STOPWORDS contains no quotes or braces; confirm.
WORD_COUNT = """
SELECT
    Word,
    count() AS count
FROM
(
    SELECT
        words AS Word
    FROM
    (
        SELECT
            splitByChar(' ', Phrases) AS words
        FROM
        (
            SELECT DISTINCT
                UserID,
                Phrases,
                LastTime
            FROM ({{subquery}})
            ARRAY JOIN
                Phrases
        )
    )
    ARRAY JOIN words
)
WHERE
    Word NOT IN ({stop_words})
GROUP BY
    Word
ORDER BY
    count DESC
""".format(stop_words="'" + "','".join(STOPWORDS) + "'")

# Host frequencies, most frequent first: Hosts is unrolled with ARRAY JOIN,
# rows are de-duplicated per (UserID, Host, LastTime), then counted per Host.
# Unformatted template; the caller supplies `subquery`.
HOST_COUNT = """
SELECT
    Host,
    count() AS count
FROM
(
    SELECT
        DISTINCT
        UserID,
        Hosts AS Host,
        LastTime
    FROM ({subquery})
    ARRAY JOIN
        Hosts
)
GROUP BY
    Host
ORDER BY
    count DESC
"""

# Phrase frequencies, most frequent first: Phrases is unrolled with
# ARRAY JOIN, rows are de-duplicated per (UserID, Query, LastTime), then
# counted per Query.
# Unformatted template; the caller supplies `subquery`.
QUERY_COUNT = """
SELECT
    Query,
    count() AS count
FROM
(
    SELECT DISTINCT
        UserID,
        Phrases AS Query,
        LastTime
    FROM ({subquery})
    ARRAY JOIN
        Phrases
)
GROUP BY
    Query
ORDER BY
    count DESC
"""

# (Query, Host) pair frequencies, most frequent first: Phrases and Hosts are
# unrolled together with ARRAY JOIN, rows are de-duplicated per
# (UserID, Query, Host, LastTime), then counted per pair.
# Unformatted template; the caller supplies `subquery`.
QUERY_HOST_COUNT = """
SELECT
    Query,
    Host,
    count() AS count
FROM
(
    SELECT DISTINCT
        UserID,
        Phrases AS Query,
        Hosts AS Host,
        LastTime
    FROM ({subquery})
    ARRAY JOIN
        Phrases,
        Hosts
)
GROUP BY
    Query, Host
ORDER BY
    count DESC
"""

# Sum of the `count` column of a subquery. The `subquery` placeholder is
# single-braced, so it is still present after TOTAL is embedded below.
TOTAL = """
SELECT
    sum(count)
FROM
    ({subquery})
"""

# Wraps a counting query and adds a `fraction` column: each row's count
# divided by the total over the same subquery. After the .format() call the
# template contains the `subquery` placeholder twice (once inside the
# embedded TOTAL, once in the outer FROM).
ADD_PERCENT = """
SELECT *,
    count/({total}) as fraction
FROM
    ({{subquery}})
""".format(total=TOTAL)


def with_percent(query):
    """Wrap a counting query so every row also carries its share of the total.

    :param query: SQL text of a query exposing a ``count`` column
    :return: SQL text selecting all columns of ``query`` plus ``fraction``
    """
    wrapped = ADD_PERCENT.format(subquery=query)
    return wrapped

