# Optional tagging input: the owning service's name. Defaults to the empty string.
variable "service" {
  description = "The name of the service this module is being deployed as part of.  Used in tags to identify resources when this module is deployed into monolithic AWS accounts."
  default     = ""
}

# Optional tagging input: the deployment environment. Defaults to "production".
variable "env" {
  description = "The environment this module is being deployed as part of.  Used in tags to identify resources when this module is deployed into monolithic AWS accounts."
  default     = "production"
}

# Required (no default). The accepted values are listed in the description only;
# this block does not validate them.
variable "database_type" {
  description = "Type of database to unload from, 'rds', 'dynamodb', or 'redshift'"
}

# RDS-only switch. The default is the string "False", not a native bool.
# NOTE(review): presumably the consumer matches the strings "True"/"False" - confirm
# before passing a real boolean.
variable "aurora" {
  description = "For RDS, if True, use Aurora API functions. `skip_snapshot` must also be True."
  default     = "False"
}

# RDS-only switch. The default is the string "False", not a native bool.
# NOTE(review): presumably the consumer matches the strings "True"/"False" - confirm
# before passing a real boolean.
variable "skip_snapshot" {
  description = "For RDS, if True, don't spin up a snapshot. Just query the given cluster. The read throughput can be heavy, so only do this on read replicas."
  default     = "False"
}

# RDS-only switch. The default is the string "False", not a native bool.
# NOTE(review): presumably the consumer matches the strings "True"/"False" - confirm
# before passing a real boolean.
variable "use_latest_snapshot" {
  description = "For RDS, if True, restore from latest db snapshot for the given cluster instead of point-in-time. Significantly faster than using point-in-time recovery for large databases."
  default     = "False"
}

# Required (no default). The lowercase requirement stated in the description is
# not enforced by this block.
variable "job_name" {
  description = "Name used for all resources created by Terraform. Required to be lowercase."
}

# Required (no default): callers must always supply a value.
variable "cluster_name" {
  description = "RDS cluster ID"
}

# Optional; defaults to the empty string.
variable "tahoe_producer_name" {
  description = "Name of the application already registered with the Tahoe API"
  default     = ""
}

# Optional; defaults to the empty string.
variable "tahoe_producer_role_arn" {
  description = "The ARN for the role returned when registering your producer with the Tahoe API"
  default     = ""
}

# Optional. Note the default is the string "2" rather than a number.
variable "tahoe_max_concurrent_imports" {
  description = "Controls how many Tahoe imports can be executed in parallel"
  default     = "2"
}

# Required map keyed by table name. Per the description, each value is a string
# holding a dictionary of per-table settings; the recognised keys are documented
# in the heredoc below. Nothing inside the heredoc is interpreted by Terraform.
variable "table_config" {
  description = <<EOF
Mapping of table names to a string containing a dictionary of table configuration.
Current keys:
  "schema": list of fields you want to export with properties:
    "name": Required name of field in database.
    "type": Required type of field in database. Options: bigint, string, float, boolean, timestamp, int, multi, struct.
        On Dynamo, structured fields should be imported as "string" if you aren't using the spark_optimization.
        You can have cleaning code that loads those strings (via ast.literal_eval) and extracts specific values.
        If you are using the spark_optimization, you can import them as "struct". See the Dynamo example
        for how to extract a value.
        Also for Dynamo, you should specify all integers as "bigint".
        If you have a Dynamo property with multiple types, export it as "multi" to get string
        representations of the various types.
        In "multi" fields, Dynamo Set and Binary types are not supported due to a Glue limitation.
    "output_type": Optional type for the output. Useful if you convert types in the cleaning code
        (e.g. converting a Dynamo epoch time to a timestamp).
    "is_derived": Optional boolean to specify if the field is derived from cleaning code. Defaults to False.
    "sensitivity": Optional sensitivity of sensitive identifiers. If you have a new type
        of sensitive identifier or you want to export or have any questions about sensitivity
        classification, please contact #data-infrastructure and security.
        Current allowed values (casing of values is ignored):
          "none" (or no "sensitivity" key) for non-sensitive data
          "userid" for user ID/name fields (if it's a table for broadcaster info, please contact DI)
          "ip" for IP fields
          Other types on https://data.xarth.tv/datasensitivity/overview.html#what-is-sensitivity
  "version": number which should be incremented every time the schema of the table changes (Minimum: 0)
  "tahoe_view_name": Optional name of tahoe view (Defaults to table name with hyphens replaced with underscores). This should omit the `dbsnapshots.` prefix
  "output_fields": list of output field names (Optional. defaults to list of fields in schema)
  "dpu_count": number of Glue workers to use when dumping this table. Deprecated in favor of worker_count (Default: 10. Minimum: 1)
  "worker_count": number of Glue workers to use when dumping this table. See the README (Default: 10. Minimum: 1)
  "hashexpression": SQL expression to partition database load. The expression should generate a
      numeric value that is well-distributed across the data (it can just be a numeric
      column's name, e.g. "id"). Mutually exclusive with hashfield. See https://docs.aws.amazon.com/glue/latest/dg/run-jdbc-parallel-read-job.html
  "hashfield": Field used to partition database load. Should be set to a non-numeric field that is
      well-distributed across the data (e.g. "uuid"). Mutually exclusive with hashexpression.
      See https://docs.aws.amazon.com/glue/latest/dg/run-jdbc-parallel-read-job.html
  "hashpartitions": Number of DB connections to open when reading this table. See the README. (Default: 7). Does
      nothing without "hashexpression" or "hashfield". See https://docs.aws.amazon.com/glue/latest/dg/run-jdbc-parallel-read-job.html
  "dynamodb_splits_count": Number of tasks used to copy DynamoDB to S3. Overrides top-level variable. See README
      (Default: 1. Maximum: 1000000).
  "read_ratio": Required for DynamoDB tables, the read ratio to use. See README (e.g. "0.2")
  "database": Database on the cluster the table lives in. Can leave out if it's the cluster default.
  "namespace": Namespace (aka schema) the table lives in. Can leave out if it's "public" or the cluster default.
  "max_attempts": Maximum number of tries to import this table (Default: 3)
  "output_format": Format to write the data in. Don't change this unless data-infra tells you to.
      Tahoe imports are only supported for parquet. (Default: parquet)
  "custom_view_sql_def": Custom SELECT SQL statement for the Tahoe view definition (Default: SELECTs all columns)
  "worker_type": One of `Standard`, `G.1X`, or `G.2X` (Default: `Standard`)
  "spark_optimization": Boolean that denotes whether or not to run native spark code for transformation to decrease run time.
    This setting can be set on a per-data source basis.
    If you are using this option, it is highly recommended to use "spark_cleaning_code" instead of
    "cleaning_code" for performance reasons. Additionally, with this option, if you do use
    "cleaning_code" and then use "Map.apply", you will have issues with microsecond values being
    incorrect on timestamps due to a Glue bug.
    This option will be permanent in the future after more thorough testing.
See the README for some examples.
EOF
  type        = map
}

# Optional Python snippet applied to a Glue DynamicFrame; the default "return frame"
# hands the frame back unchanged.
# The description heredoc (EOF) contains a nested example heredoc (CODE). That inner
# marker is plain text here: only the EOF line terminates the description.
variable "cleaning_code" {
  description = <<EOF
Optional Python code to clean data before export. See https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-python.html
A DynamicFrame "frame" is available, and you should return the cleaned DynamicFrame.
If you are using spark_optimization, you almost definitely want to do "spark_cleaning_code" instead
of this because the code will be much more performant.
It's Python, so be careful about indentation. The table name currently being operated on is available
as "table_name" to change behavior based on that. Also, your code is in the script's global namespace,
so be careful with variable names.
Example:
  cleaning_code = <<CODE
if table_name == 'some_table':
  def reassign(rec):
    if rec['id'] == 'bad_value':
      rec['id'] = 'fixed_value'
    return rec
  return Map.apply(frame=frame, f=reassign)
else:
  return frame
CODE

Some gotchas collected from previous problems:

  - Make sure the returned frame has all fields defined at a minimum with a None value.
    For example, due to the schema-less nature of DynamoDB missing fields can happen there. Derived
    columns are another potential cause.
  - String fields with null bytes result in data truncation on Tahoe. If cleaning up the source is
    not possible, then make sure to clean them up here. For example, you could use
    "rec['field'] = rec['field'].replace('\x00', '')" to replace null bytes with empty strings.
  - Derived timestamp fields must be emitted as Python datetime objects. If they have sub-second
    resolution, any fractional second less than one tenth of a second will be misrepresented
    (leading zeroes in the decimal representation are removed). This is due to a Glue bug in
    Map.apply.

EOF
  default     = "return frame"
}

# Optional Python snippet applied to a Spark DataFrame named "df"; the default
# "return df" hands the DataFrame back unchanged.
# The description heredoc (EOF) contains nested example heredocs (CODE); those inner
# markers are plain text here and only the EOF line terminates the description.
# NOTE(review): the first example assigns `resolved_choice_df` from `spark_cleaned_df`
# and never returns, unlike the documented `df` input and the second example -
# confirm whether it is meant to be copy-pasteable user code.
variable "spark_cleaning_code" {
  description = <<EOF
Optional Python code to clean data before export. Unlike the above "cleaning_code" variable,
this is given a Spark DataFrame "df" instead of a Glue DynamicFrame "frame".
See https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html for more on DataFrames.

If you are using spark_optimization, you almost definitely want to do this and not do the above
"cleaning_code" since this code will be much more performant.

Like the above cleaning_code, it's Python, so be careful about indentation.
The table name currently being operated on is available
as "table_name" to change behavior based on that. Also, your code is in the script's global namespace,
so be careful with variable names.

You will get the best performance if you use native Spark functions (pyspark.sql.functions is imported
as F). Example code that converts empty or whitespace strings to null (this is already done by default):

  spark_cleaning_code = <<CODE
str_col = 'someColumn'
convert_blank_strings_to_null_func = (F.when((F.ltrim(F.col(str_col)) == ''), None)
    .otherwise(F.col(str_col)))
resolved_choice_df = spark_cleaned_df.withColumn(str_col, convert_blank_strings_to_null_func)
CODE

If Spark functions don't meet your needs, you can create a Python user-defined function (udf)
to run arbitrary code on a given column, e.g. adding the string "x" to "someColumn":

  spark_cleaning_code = <<CODE
@udf('string')
def add_x(s):
  return (s or '') + 'x'
return df.withColumn("someColumn", add_x("someColumn"))
CODE

Some gotchas collected from previous problems:

  - Make sure the returned frame has all fields defined at a minimum with a None value.
    For example, due to the schema-less nature of DynamoDB missing fields can happen there. Derived
    columns are another potential cause.
  - String fields with null bytes result in data truncation on Tahoe. If cleaning up the source is
    not possible, then make sure to clean them up here. For example, you could use
    "rec['field'] = rec['field'].replace('\x00', '')" to replace null bytes with empty strings.
EOF

  default = "return df"
}

# Optional. The default is a six-field cron expression (minute 0, hour 8, every day),
# i.e. the 8 AM UTC run the description mentions.
variable "trigger_schedule" {
  description = "Cron schedule for running the export job. Defaults to 8 AM UTC (midnight or 1 AM PT depending on DST). See https://docs.aws.amazon.com/glue/latest/dg/monitor-data-warehouse-schedule.html"
  default     = "0 8 * * ? *"
}

# Required (no default): name of the bucket the exported data is written to.
variable "s3_output_bucket" {
  description = "S3 bucket to write data to e.g. \"<team-name>-db-exports\""
}

# Numeric on/off flag: 1 (the default) versus 0 to reuse a pre-existing bucket,
# as described below.
variable "create_s3_output_bucket" {
  description = <<EOF
Set to 0 if you would like to use a pre-existing bucket.
If you do, make sure to tweak the existing bucket's policy
to mimic the "tahoe-api-access" policy defined in glue.tf.
EOF
  default     = 1
}

# Required (no default): key (prefix) inside the output bucket.
variable "s3_output_key" {
  description = "S3 key to write data to e.g. \"<database-name>\""
}

# Optional; the empty-string default corresponds to the "one will be created for you"
# case in the description.
# NOTE(review): the "key policy below" is not part of this block - presumably it
# refers to a policy defined elsewhere in the module; confirm and point to it.
variable "s3_output_kms_key_arn" {
  description = <<EOF
Optional ARN of existing KMS key used to encrypt s3 output data.
If you do not specify this, one will be created for you automatically.
If you do specify this, make sure to mimic the key policy below.
EOF
  default     = ""
}

# Numeric on/off flag, off (0) by default. locals.computed_s3_script_bucket tests
# this value for equality with the number 1.
variable "create_s3_script_bucket" {
  description = "Set to 1 if you would like the s3 script bucket to be automatically created."
  default     = 0
}

# Required (no default). locals.computed_s3_script_bucket falls back to this name
# whenever create_s3_script_bucket is not 1.
variable "s3_script_bucket" {
  description = "Bucket to store Glue s3 scripts to."
}

locals {
  # Normalized name of the Glue script bucket.
  # When create_s3_script_bucket equals 1, the name is read from the
  # aws_s3_bucket.glue-scripts[0] resource, so anything using this local depends on
  # that resource and the bucket is created before Terraform uses references to it.
  # Otherwise the caller-supplied var.s3_script_bucket is used unchanged.
  # NOTE(review): Terraform's == does not convert types, so a caller passing the
  # string "1" would presumably take the var.s3_script_bucket branch - confirm this
  # matches how the bucket resource computes its count.
  computed_s3_script_bucket = var.create_s3_script_bucket == 1 ? aws_s3_bucket.glue-scripts[0].bucket : var.s3_script_bucket
}

# Required (no default). The description embeds the SNS policy statement the topic
# needs so that events.amazonaws.com is allowed to publish to it.
variable "error_sns_topic_name" {
  description = <<EOF
Name of SNS Topic to send error messages to. The topic must have this in its policy statements
with the correct account ID and topic name:
{
  "Sid": "TrustCWEToPublishEventsToMyTopic",
  "Effect": "Allow",
  "Principal": {
    "Service": "events.amazonaws.com"
  },
  "Action": "sns:Publish",
  "Resource": "arn:aws:sns:us-west-2:<account-id>:<topic-name>"
}
EOF
}

# Required (no default).
variable "account_number" {
  description = "AWS account number we are running in."
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "vpc_id" {
  description = "VPC to run glue in. Not required for Dynamo DB."
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "subnet_id" {
  description = "VPC subnet to run glue in. Must have a NAT or s3 VPC endpoint, be in above VPC, and be able to reach DB. Not required for Dynamo DB."
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "availability_zone" {
  description = "Availability Zone of above subnet. Not required for Dynamo DB."
}

# No default is declared, so a value must be supplied even for DynamoDB or
# skip_snapshot jobs, where the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "rds_subnet_group" {
  description = "Name of RDS subnet group to launch snapshot cluster in. Not required for DynamoDB or with skip_snapshot."
}

# Optional; defaults to the empty string.
# NOTE(review): presumably empty means "keep the source database's instance class"
# - confirm against the consumer.
variable "rds_instance_class_override" {
  description = "Set this to have a different instance class than your master db when this process restores the snapshot, i.e. db.m4.large."
  default     = ""
}

# Marked deprecated in the description; defaults to 1. The documented range
# (1 to 1000000) is not validated by this block.
variable "dynamodb_splits_count" {
  description = "Deprecated - number of tasks used to copy DynamoDB to S3. Valid from 1 to 1000000, inclusive."
  default     = 1
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "cluster_username" {
  description = "Username to connect to above RDS cluster. Make sure it has read-only privileges on the necessary tables. Not required for DynamoDB"
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "db_password_parameter_name" {
  description = "Parameter name of parameter store secret containing DB password. Not required for Dynamo DB"
}

# No default is declared, so a value must be supplied even for DynamoDB jobs, where
# the description says it is not needed.
# NOTE(review): presumably an empty string is acceptable in that case - confirm.
variable "db_password_key_id" {
  description = "ID of the KMS key used to encrypt DB password e.g. 6043ca4d-151e-4efb-98da-42d2b661d9fb. Not required for Dynamo DB"
}

# Required (no default).
variable "api_key_parameter_name" {
  description = "Parameter name of parameter store secret containing the producer API key."
}

# Required (no default).
variable "api_key_kms_key_id" {
  description = "ID of the KMS key used to encrypt the producer API key e.g. 6043ca4d-151e-4efb-98da-42d2b661d9fb."
}

# Optional; defaults to 3, the same figure the description quotes as the default
# AWS limit.
variable "max_concurrent_runs" {
  description = "Maximum number of tables to unload at the same time. Default AWS limit is 3 (Glue - Number of concurrent job runs per job). Can be increased."
  default     = 3
}

# Flag encoded as a string; the default is "0".
# NOTE(review): presumably "1" enables failing on transformation errors - confirm
# the exact values the consumer accepts.
variable "fail_on_error" {
  description = "Denotes whether or not to fail the table load job if there are any errors in the transformation process. NOTE: will be set to true as default in the near future."
  default     = "0"
}

# Optional; defaults to the empty string. Setting it to a production job's name
# turns this deployment into a QA job, with the caveats listed in the description.
# NOTE(review): presumably the empty default means "regular, non-QA job" - confirm.
variable "qa_for_job_name" {
  description = <<EOF
Set this to the name of a production job to create a QA job for that job.
This job will load data into a qa_dbsnapshots view instead of dbsnapshots,
a separate underlying Tahoe table, and will not interfere with production jobs
(besides generating more load on the database when not using an RDS snapshot).
The job definition should match the production job, with these caveats:
1. The job_name must be different, e.g. by appending `-qa` to the production job name.
2. This job must reuse the s3_output_bucket and s3_output_kms_key_arn either specified for or created by
   the production job (if the KMS key was generated, it is available as the "s3_kms_key" Terraform output
   from the production job). These must be the same as production to allow the existing
   Tahoe producer to work.
3. The s3_output_key must match production. The actually-used key prefix will have "/qa" appended
   to keep data separate.
4. These jobs will not be scheduled and must be run manually. If you are not using RDS snapshots,
   be careful about running them at the same time as your production jobs in case of excessive
   DB utilization.
5. Though the Tahoe tables are separate from production, the same caveats around versioning apply.
   If you do change the schema, you must increment the version. An additional caveat is that if you
   don't end up applying the QA'd changes but had incremented the version, that version(s) will still
   be unavailable to future QA jobs.
6. error_sns_topic_name will be unused, but it must be set (e.g. as the empty string).
7. Make any other parameter changes you want to test (e.g. table_config or spark_cleaning_code).
EOF
  default     = ""
}
