# Job-specific settings layered on top of the running session's configuration.
# getConf() is the public accessor and returns a copy of the context's
# SparkConf, so the private `_conf` attribute does not have to be touched.
conf = spark.sparkContext.getConf().setAll([
    # Hive metastore endpoint used for table lookups.
    ('hive.metastore.uris', 'thrift://tahoe-metastore.di.twitch.a2z.com:9084'),
    # Read metastore Parquet tables through the Hive SerDe instead of Spark's
    # built-in Parquet support.
    ('spark.sql.hive.convertMetastoreParquet', False),
    # Let input paths and Hive tables include files in nested subdirectories.
    ('spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive', True),
    ('hive.mapred.supports.subdirectories', True),
    # On overwrite, replace only the partitions that receive new data.
    ('spark.sql.sources.partitionOverwriteMode', 'dynamic'),
    # Speculatively re-launch tasks that run much slower than the median.
    ('spark.speculation', True),
    ('spark.speculation.multiplier', 2),
])

# Stop the current context, then rebuild the session (with Hive support)
# from the updated configuration.
spark.sparkContext.stop()
spark = SparkSession.builder.enableHiveSupport().config(conf=conf).getOrCreate()

from datetime import datetime

# Pre-aggregate viewer activity one calendar month at a time and write each
# month to its own year=/month= directory under user_hw_preagg.
year = 2019
for month in range(12):
    # `month` is zero-based, so the calendar month is month + 1.
    start_date = datetime(year, month + 1, 1).strftime("%Y-%m-%d")
    # Exclusive upper bound: first day of the following month. (month + 1) is
    # the zero-based index of that month; after December it wraps to January
    # of year + 1.
    end_date = datetime(year + (month + 1) // 12, (month + 1) % 12 + 1, 1).strftime("%Y-%m-%d")
    print(start_date, end_date)
    # DataFrameWriter.parquet() returns None, so its result is deliberately
    # not bound to a name.
    spark.sql(f"""
    select
        user_id_v0 as user_id,
        channel_id_v1 as channel_id,
        game_name_v1 as game_name,
        SUM(hours_watched_v1) as hours_watched
    from sheik.fact_viewer_activity_daily_v2 
    where day_v0 >= '{start_date}' and day_v0 < '{end_date}'
    group by 1,2,3
    """).write.parquet(f's3://rudolph-spark/user_hw_preagg/year={year}/month={month+1}/')


# Load the pre-aggregated Parquet data, mark it for caching, and expose it to
# SQL as `hw_preagg`.
# The view is registered in a separate statement so `df` stays bound to the
# DataFrame (the registration call returns None), and
# createOrReplaceTempView replaces the deprecated registerTempTable.
df = spark.read.parquet('s3://rudolph-spark/user_hw_preagg').persist()
df.createOrReplaceTempView("hw_preagg")

# Channel ranking per user: the inner query sums hours_watched per
# (user_id, channel_id) from `hw_preagg`; the outer query numbers each user's
# rows by descending total. All rows are written, not only rank 1.
top_channel_query = """
select 
    *,
    ROW_NUMBER() OVER (
        PARTITION BY user_id
        ORDER BY
            hours_watched DESC
    ) AS rank
FROM (
    select
        user_id as user_id,
        channel_id as channel_id,
        SUM(hours_watched) as hours_watched
    from hw_preagg
    GROUP BY 1,2
)
"""
top_channel = spark.sql(top_channel_query)
top_channel.write.parquet('s3://rudolph-spark/user_top_channel')

# Game ranking per user: the inner query sums hours_watched per
# (user_id, game_name) from `hw_preagg`; the outer query numbers each user's
# rows by descending total. All rows are written, not only rank 1.
top_game_query = """
select 
    *,
    ROW_NUMBER() OVER (
        PARTITION BY user_id
        ORDER BY
            hours_watched DESC
    ) AS rank
FROM (
    select
        user_id as user_id,
        game_name as game_name,
        SUM(hours_watched) as hours_watched
    from hw_preagg
    GROUP BY 1,2
)
"""
top_game = spark.sql(top_game_query)
top_game.write.parquet('s3://rudolph-spark/user_top_game')

# Total hours watched per user, summed over every row of `hw_preagg`.
hw_total_query = """
SELECT
    user_id as user_id,
    SUM(hours_watched) as hours_watched
FROM hw_preagg
GROUP BY 1
"""
hw_total = spark.sql(hw_total_query)
hw_total.write.parquet('s3://rudolph-spark/user_hw_total')
