"""
// This Python snippet transforms the following schema:
  //   fr             string        user:145671775
  //   to             string        user:49717001
  //   et             string        follows
  //   db             string        { "block_notifications" : { "BOOL" : true }}
  //   ca             float         1553561495803116957
  //   ua             float         1553561495803116957
  // To:
  //   target_user_id string        49717001
  //   from_user_id   string        145671775
  //   action         string        follows
  //   notifs_on      boolean       false
  //   followed_on    timestamp     2019-03-26T00:51:35.803
  //   updated_on     timestamp     2019-03-26T00:51:35.803
"""

from pyspark.sql import types as T
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import when, from_json

"""
For some reason Spark interprets {"block_notifications":{"BOOL":true}}
as {"block_notifications": BooleanType} and not as {"block_notifications": {"BOOL": BooleanType}},
so the schema below may look wrong, but it resolves correctly.
"""
# Schema used to parse the JSON text in the `db` column: a struct with a single
# boolean field named `block_notifications`.
block_notifications_field = T.StructField('block_notifications', T.BooleanType())
json_schema = T.StructType([block_notifications_field])
# [5:100] will grab the substring after "user:". 100 is an arbitrary upper bound, but spark requires it
df = df.withColumn('target_user_id', when(F.col('to').startswith('user:'), F.col('to')[6:100].cast('string')).otherwise(None))
df = df.withColumn('from_user_id', when(F.col('fr').startswith('user:'), F.col('fr')[6:100].cast('string')).otherwise(None))
df = df.withColumn('action', F.col('et'))
# ~ == not (to convert to bool)
df = df.withColumn('notifs_on', ~from_json(F.col('db'), json_schema).block_notifications)

# Truncate timestamp from nanoseconds to seconds.
# Python doesn't cleanly convert nanosecond timestamps.
df = df.withColumn('followed_on', F.to_utc_timestamp((F.col('ca')/1000000000).cast('timestamp'), "GMT"))
df = df.withColumn('updated_on', F.to_utc_timestamp((F.col('ua')/1000000000).cast('timestamp'), "GMT"))
return df