import time
from datetime import datetime, timedelta

from absl import app, flags, logging
from absl.flags import FLAGS
import psycopg2
import pandas as pd
import numpy as np

from mgst_data.config import DB_HOST, DB_USER, DB_PASSWORD
from mgst_data.utils import upsert_data, date_validator, query

# Reporting window, given as YYYY-MM-DD strings. main() queries
# `day >= start_date AND day < end_date`, so the end date is exclusive.
flags.DEFINE_string(
    name='start_date',
    default=None,
    help='Starting Date (2019-01-01)',
)
flags.DEFINE_string(
    name='end_date',
    default=None,
    help='Ending Date (2019-02-01)',
)

# Each flag gets its own checker built by date_validator().
flags.register_validator(
    flag_name='start_date',
    checker=date_validator(),
    message='invalid date',
)
flags.register_validator(
    flag_name='end_date',
    checker=date_validator(),
    message='invalid date',
)

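'''
Target table written by main() (one row per date / platform / country /
is_mobile):

CREATE TABLE IF NOT EXISTS mgst.marionette_country (
  date timestamp,
  platform VARCHAR(255),
  country VARCHAR(255),
  is_mobile BOOLEAN,
  hours_watched NUMBER,
  hw_rank INTEGER  -- written by main(); column type assumed, confirm against the live table
);
'''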


def combine_adj(comp_df_30, comp_df, twitch_df):
    """Rank platforms per country after re-allocating unattributed hours.

    Competitor rows in ``comp_df`` that lack a country are redistributed in
    two steps:

    1. Hours with neither language nor country are split across languages
       using each platform's language mix in ``comp_df_30``.
    2. The per-language hours without a country (original rows plus the
       share from step 1) are split across countries using the
       language-to-country mix of the ``youtube`` rows in ``comp_df_30``.

    The adjusted competitor hours are then stacked with ``twitch_df`` and
    ranked within each country by hours watched.

    Args:
        comp_df_30: Competitor rows used as the reference distribution.
            Needs ``platform``, ``language``, ``country`` and
            ``hours_watched`` columns.
        comp_df: Competitor rows for the period being reported; same
            columns as ``comp_df_30``.
        twitch_df: Rows with ``platform``, ``country`` and
            ``hours_watched`` columns.

    Returns:
        DataFrame with ``platform``, ``country``, ``hours_watched`` and
        ``hw_rank`` columns, where ``hw_rank`` is 1 for the platform with
        the most hours in that country.

    Note:
        Hours that cannot be mapped drop out of the result: unknown hours
        of a platform that has no language rows in ``comp_df_30``, and
        hours of a language that has no ``youtube`` rows in ``comp_df_30``.
        Rows whose country is still missing are excluded by the final
        group-by.
    """
    logging.debug('combine_adj inputs:\n%s\n%s', comp_df_30, comp_df)

    # Blank / whitespace-only strings are treated the same as NULL.
    comp_df_30 = comp_df_30.replace(r'^\s*$', np.nan, regex=True)
    comp_df = comp_df.replace(r'^\s*$', np.nan, regex=True)

    no_language = comp_df.language.isnull()
    no_country = comp_df.country.isnull()

    # Total hours per platform with neither language nor country.
    unknown = (
        comp_df[no_language & no_country]
        .groupby('platform')
        .agg({'hours_watched': 'sum'})
        .reset_index()
    )

    # Share of each language within its platform over the reference window.
    # transform('sum') keeps the result aligned with the rows; the previous
    # groupby().apply() form depends on version-specific index handling.
    language_perc = (
        comp_df_30.groupby(['platform', 'language'])
        .agg({'hours_watched': 'sum'})
        .reset_index()
    )
    platform_total = language_perc.groupby(
        'platform')['hours_watched'].transform('sum')
    language_perc['perc'] = language_perc['hours_watched'] / platform_total

    # Additional hours per language taken from the platform's unknown pool.
    add_language = language_perc.merge(
        unknown, on='platform', suffixes=('', '_total'))
    add_language['additional'] = (
        add_language['perc'] * add_language['hours_watched_total'])

    # Adjusted hours per language for rows that have no country. The outer
    # merge keeps languages that only appear in the redistributed pool.
    noc = comp_df[~no_language & no_country]
    noc = noc.merge(
        add_language[['platform', 'language', 'additional']],
        how='outer', on=['platform', 'language'])
    noc['hours_watched_adj'] = (
        noc['hours_watched'].fillna(0) + noc['additional'].fillna(0))

    # Language-to-country ratio, taken from the youtube rows only.
    yt_df = (
        comp_df_30[comp_df_30.platform == 'youtube']
        .groupby(['language', 'country'])
        .agg({'hours_watched': 'sum'})
        .reset_index()
    )
    language_total = yt_df.groupby(
        'language')['hours_watched'].transform('sum')
    yt_df['lc_perc'] = yt_df['hours_watched'] / language_total

    # Spread the country-less hours over countries using the youtube ratio.
    adj_df = noc.merge(
        yt_df, how='left', on='language', suffixes=('', '_ytlc'))
    adj_df['country'] = adj_df['country_ytlc']
    adj_df['hours_watched'] = adj_df['hours_watched_adj'] * adj_df['lc_perc']

    # Original rows plus redistributed rows; rows with a missing country
    # are dropped by the group-by, so redistributed hours are not counted
    # twice.
    comp_df_adj = pd.concat([
        comp_df[['platform', 'country', 'hours_watched']],
        adj_df[['platform', 'country', 'hours_watched']]])

    c_g = comp_df_adj.groupby(['platform', 'country']).agg(
        {'hours_watched': 'sum'}).reset_index()
    t_g = twitch_df.groupby(['platform', 'country']).agg(
        {'hours_watched': 'sum'}).reset_index()

    # Combine twitch and competitors, then rank platforms inside a country.
    all_df = pd.concat([c_g, t_g], ignore_index=True)
    all_df = all_df.sort_values(
        ['country', 'hours_watched'], ascending=[True, False])
    all_df['hw_rank'] = all_df.groupby(['country']).cumcount() + 1

    return all_df


# Competitor hours watched per (country, language, platform, is_mobile).
# main() runs it twice with different date windows.
_COMP_HOURS_SQL = """
    SELECT
      tm.country_code AS country,
      tm.language AS language,
      m.platform AS platform,
      nvl(game, '') in (
        SELECT
          NAME
        FROM
          mgst.mobile_games
      ) as is_mobile,
      sum(minutes_watched_adjusted) / 60.0 AS hours_watched
    FROM
      cubes.hourly_comp_intel_by_channel_video_game_lang_country AS m
      LEFT JOIN mgst.marionette_metadata_prod tm ON (m.channel_id = tm.channel_id)
    WHERE
      day >= %(start_date)s
      AND day < %(end_date)s
    GROUP BY
      1,
      2,
      3,
      4
    """

# Twitch hours watched per (country, platform, is_mobile).
_TWITCH_HOURS_SQL = """
    SELECT
      UPPER(
        primary_broadcast_country_past_30_days
      ) AS country,
      'twitch' AS platform,
      nvl(primary_broadcast_game, '') in (
        SELECT
          NAME
        FROM
          mgst.mobile_games
      ) as is_mobile,
      sum(live_minutes_watched_total) / 60.0 AS hours_watched
    FROM
      cubes.creator_daily_channel_summary_only_days_with_activity t1
    WHERE
      day >= %(start_date)s
      AND day < %(end_date)s
    GROUP BY
      1,
      2,
      3
    """

_OUTPUT_TABLE = 'mgst.marionette_country'
_OUTPUT_COLUMNS = ('date', 'platform', 'country',
                   'is_mobile', 'hours_watched', 'hw_rank')
# NOTE(review): the key list does not include 'country'. Whether that is
# correct depends on how upsert_data uses the keys -- confirm against
# mgst_data.utils.upsert_data.
_UPSERT_KEYS = ('date', 'platform', 'is_mobile')


def _upsert_ranking(con, ranked_df, is_mobile):
    """Write one ranked frame to the output table, dated FLAGS.start_date."""
    rows = [
        [FLAGS.start_date, row.platform, row.country,
         is_mobile, row.hours_watched, row.hw_rank]
        for row in ranked_df.itertuples()
    ]
    # Fresh lists per call, matching the original call sites.
    upsert_data(con, _OUTPUT_TABLE,
                list(_OUTPUT_COLUMNS), list(_UPSERT_KEYS), rows)


def main(_argv):
    """Compute per-country platform rankings and upsert them.

    Loads competitor hours for the reporting window and for a reference
    window that starts 30 days before ``--start_date``, loads Twitch hours
    for the reporting window, runs ``combine_adj`` separately for mobile
    and non-mobile rows, and upserts both results into
    ``mgst.marionette_country`` under ``--start_date``.

    Args:
        _argv: Positional command-line arguments passed by ``app.run``;
            unused.

    Raises:
        app.UsageError: If ``--start_date`` or ``--end_date`` is missing.
    """
    if FLAGS.start_date is None or FLAGS.end_date is None:
        # Without this, a missing start date fails inside strptime and a
        # missing end date turns into `day < NULL` in the queries.
        raise app.UsageError('--start_date and --end_date are both required')

    start_time = time.time()
    logging.info("Processing channels from %s to %s",
                 FLAGS.start_date, FLAGS.end_date)

    # Parse dates before connecting so a bad date never opens a connection.
    start_date = datetime.strptime(FLAGS.start_date, "%Y-%m-%d")
    last_30 = datetime.strftime(start_date - timedelta(days=30), "%Y-%m-%d")

    report_window = {'start_date': FLAGS.start_date,
                     'end_date': FLAGS.end_date}
    # Reference window used by combine_adj for language/country ratios.
    reference_window = {'start_date': last_30, 'end_date': FLAGS.end_date}

    con = psycopg2.connect(dbname='product', host=DB_HOST,
                           port='5439', user=DB_USER, password=DB_PASSWORD)
    try:
        comp_df = pd.read_sql(_COMP_HOURS_SQL, con, params=reference_window)
        comp_df_1 = pd.read_sql(_COMP_HOURS_SQL, con, params=report_window)
        twitch_df_1 = pd.read_sql(
            _TWITCH_HOURS_SQL, con, params=report_window)

        for is_mobile in (True, False):
            # An explicit comparison (instead of mask / ~mask) also works
            # when the column comes back as object dtype containing NULLs.
            ranked_df = combine_adj(
                comp_df[comp_df.is_mobile == is_mobile],
                comp_df_1[comp_df_1.is_mobile == is_mobile],
                twitch_df_1[twitch_df_1.is_mobile == is_mobile])
            logging.info("is_mobile=%s ranking:\n%s", is_mobile, ranked_df)
            _upsert_ranking(con, ranked_df, is_mobile)
    finally:
        # The original never closed the connection.
        con.close()

    logging.info("Done %ds", time.time() - start_time)


# Script entry point: hand main() to absl's app.run.
if __name__ == '__main__':
    app.run(main)
