#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import sys
import ipaddr
import json
from functools import partial


# Short names for the CSV columns, indexed by column position.
# The header they correspond to can be listed with:
#   head -1 GeoIP2-Anonymous-IP-Blocks-IPv4.csv | sed -e 's:\([a-z_]*\):"\1":g' -e 's:,:\n,:g'
idx2name = [
    "network",
    "anon",  # useless, because exists in all records
    "vpn",
    "hosting",
    "proxy",
    "tor",
    "resident-proxy",
]

# Index of the first flag column that is actually examined (skips "network" and "anon").
FLAGS_START_IDX = 2
# A CSV row must have exactly this many columns to be accepted.
WANTED_COLUMNS_QTY = len(idx2name)

# Flag names that are extracted from rows and compared in check mode.
checked_attr_names = idx2name[FLAGS_START_IDX:]
# Process-wide counters, filled in by inc_stat_counter().
stats = {}

# Accepted values of the --mode option.
mode_conv = "convert"
mode_check = "check"


def parse_args():
    """Parse the command line and return the argparse namespace.

    Options:
        --mode             one of mode_conv / mode_check (default: mode_conv).
                           Restricted with ``choices`` so that a typo is
                           rejected by argparse up front instead of failing
                           later, once per input row.
        --json-traits      input rows are "<begin>-<end>\\t<json>" instead of
                           the comma separated MaxMind CSV.
        --ignore-ip-flags  comma separated flag names skipped in check mode.
        --datafile         input path, '-' for stdin.
        --geodata          geodata path used in check mode.
        --output           output path, empty for stdout.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode',
                        default=mode_conv,
                        choices=(mode_conv, mode_check),
                        help="%s|%s" % (mode_conv, mode_check))
    parser.add_argument('--json-traits',
                        action='store_true',
                        help="non-standard format of traits in range data (for maxmind - comma separated list of flags)")
    parser.add_argument('--ignore-ip-flags',
                        default='resident-proxy',
                        help="skip these flags during testing")
    parser.add_argument('--datafile',
                        default="-",
                        help="path to file with maxmind-anon-data (GeoIP2-Anonymous-IP-Blocks-IPv[46].csv); '-' for stdin.")
    parser.add_argument('--geodata',
                        default="",
                        help="path to geodata for checking")
    parser.add_argument('--output',
                        default="",
                        help="path to output datafile; empty for stdout")
    return parser.parse_args()


def inc_stat_counter(counter_name):
    """Add one to the named counter in the module-level ``stats`` dict.

    A counter that has not been seen before starts at 1.
    """
    if counter_name in stats:
        stats[counter_name] += 1
    else:
        stats[counter_name] = 1


def extract_traits_from_mmdata(parts):
    """Build the traits dict for one already-split CSV row.

    Every flag column (indexes FLAGS_START_IDX .. WANTED_COLUMNS_QTY - 1)
    holding a non-empty value yields ``{<flag name>: 1}`` and bumps the
    statistics counter of the same name.

    :param parts: list of column values of a single row
    :return: dict mapping the names of the set flags to 1
    """
    found = {}

    for column in range(FLAGS_START_IDX, WANTED_COLUMNS_QTY):
        if parts[column]:
            flag_name = idx2name[column]
            inc_stat_counter(flag_name)
            found[flag_name] = 1

    return found


def process_row(line, processor_fn, is_json_traits):
    """Parse one input line and hand the resulting range to ``processor_fn``.

    Lines that are empty, start with '#', or do not start with a digit or
    with "::ffff:" are counted as "useless_lines" and accepted silently.

    :param line: raw input line
    :param processor_fn: callable invoked as (ip_begin, ip_end, traits)
    :param is_json_traits: True for "<begin>-<end>\\t<json>" rows,
                           False for comma separated rows
    :return: False when the column count is wrong, True otherwise
    """
    is_data_line = bool(line) and not line.startswith('#') and (
        line[0].isdigit() or line.startswith("::ffff:"))
    if not is_data_line:
        inc_stat_counter("useless_lines")
        return True

    # The two formats differ only in separator and expected column count.
    if is_json_traits:
        separator, expected_qty = '\t', 2
    else:
        separator, expected_qty = ',', WANTED_COLUMNS_QTY

    fields = line.strip().split(separator)
    if len(fields) != expected_qty:
        inc_stat_counter("bad_column_lines")
        return False

    if is_json_traits:
        first_ip, last_ip = fields[0].split('-')
        source_attrs = json.loads(fields[1])
        # In this format the traits are passed on as a list of flag names.
        present = [name for name in checked_attr_names if name in source_attrs]
        processor_fn(ipaddr.IPAddress(first_ip), ipaddr.IPAddress(last_ip), present)
        return True

    network = ipaddr.IPNetwork(fields[0])
    row_traits = extract_traits_from_mmdata(fields)
    # First and last address of the network form the range.
    processor_fn(network[0], network[-1], row_traits)
    return True


def mode_conv_fn(data_output, ip_begin, ip_end, traits):
    """Write one range as "<begin>-<end>\\t<compact json traits>" plus newline.

    :param data_output: writable text stream
    :param ip_begin: first address of the range (formatted with %s)
    :param ip_end: last address of the range (formatted with %s)
    :param traits: JSON-serialisable traits; dict keys are emitted sorted
    """
    traits_str = json.dumps(traits, separators=(',', ':'), sort_keys=True)
    # write() instead of the ``print >>stream`` statement: same bytes on
    # Python 2, and it also works on Python 3, where ``print >> stream``
    # is an expression that raises TypeError.
    data_output.write("%s-%s\t%s\n" % (ip_begin, ip_end, traits_str))


def mode_check_fn(geo6lookup, data_output, ignored_flags, ip_begin, ip_end, traits):
    """Compare the flags of one source range with what the geodata reports.

    The lookup is queried for the first and the last address of the range.
    Two kinds of mismatch are reported:

    * ``/GEO-DIFF`` - the lookup returns different flags for the two ends;
    * ``/MM-GEO``   - the flags of the source row differ from the flags the
      lookup returns for the first address.

    When at least one mismatch is found, a single line
    "<begin>-<end>\\t<json traits>\\t//\\t<errors>" is written to
    ``data_output``. Statistics counters are updated along the way.

    :param geo6lookup: object providing ``get_ip_traits(ip_string)``; the
                       result is only tested with ``in`` here
    :param data_output: writable text stream for mismatch reports
    :param ignored_flags: flag names excluded from every comparison
    :param ip_begin: first address of the range
    :param ip_end: last address of the range
    :param traits: flags of the source row (dict or list of flag names)
    """
    def extract_flags(flags, ignored_flags):
        # Must inspect the container passed in. The previous version always
        # read ip_traits_begin from the enclosing scope, which made all three
        # flag lists identical, so no mismatch could ever be detected.
        # Iterating checked_attr_names keeps a stable order for list comparison.
        return [x for x in checked_attr_names if x in flags and x not in ignored_flags]

    ip_traits_begin = geo6lookup.get_ip_traits(str(ip_begin))
    flags_begin_list = extract_flags(ip_traits_begin, ignored_flags)
    inc_stat_counter("geo_traits_begin")

    ip_traits_end = geo6lookup.get_ip_traits(str(ip_end))
    flags_end_list = extract_flags(ip_traits_end, ignored_flags)
    inc_stat_counter("geo_traits_end")

    err_parts = []

    if flags_begin_list != flags_end_list:
        inc_stat_counter("err-geo-diff")
        err_parts.append("/GEO-DIFF (%s/%s)" % (','.join(flags_begin_list), ','.join(flags_end_list)))

    curr_flags_list = extract_flags(traits, ignored_flags)
    if curr_flags_list != flags_begin_list:
        inc_stat_counter("err-mm-geo")
        err_parts.append("/MM-GEO (%s)" % (','.join(flags_begin_list)))

    # Record how often an ignored flag was present in the source data.
    for x in ignored_flags:
        if x in traits:
            inc_stat_counter("was_ignored_%s" % x)

    if err_parts:
        traits_str = json.dumps(traits, separators=(',', ':'), sort_keys=True)
        # write() produces the same output as ``print >>stream`` on Python 2
        # and also works on Python 3.
        data_output.write("%s-%s\t%s\t//\t%s\n" % (ip_begin, ip_end, traits_str, "\t".join(err_parts)))


def process_source(args):
    """Run the selected mode over every line of the input.

    Each line goes through process_row(). A row that raises is reported on
    stderr and counted; a row rejected for its column count is only counted
    in the "bad_input_lines" statistic. The collected statistics are written
    to stderr at the end.

    :param args: namespace produced by parse_args()
    :return: number of input lines whose processing raised an exception
    :raises ValueError: if args.mode is neither mode_conv nor mode_check.
        Checked before the output file is opened (and truncated); without
        the check there would be no row callback and every row would fail.
    """
    if args.mode not in (mode_conv, mode_check):
        raise ValueError("unknown mode [%s], expected %s|%s" % (args.mode, mode_conv, mode_check))

    data_input = None
    data_output = None
    try:
        data_input = sys.stdin if '-' == args.datafile else open(args.datafile, 'r')
        data_output = sys.stdout if not args.output else open(args.output, 'w')

        if args.mode == mode_conv:
            proc_fn = partial(mode_conv_fn, data_output)
        else:
            # Imported lazily: the module is only needed in check mode.
            import geobase6
            lookup = geobase6.Lookup(args.geodata)
            ignored_flags = args.ignore_ip_flags.split(',')
            proc_fn = partial(mode_check_fn, lookup, data_output, ignored_flags)

        err_qty = 0
        for line in data_input:
            try:
                inc_stat_counter("input_lines")
                if process_row(line, proc_fn, args.json_traits):
                    continue
                inc_stat_counter("bad_input_lines")
            except Exception as ex:
                # Best effort: one broken row must not stop the whole run.
                sys.stderr.write("err[%s] in [%s]\n" % (ex, line))
                inc_stat_counter("exceptions")
                err_qty += 1

        sys.stderr.write("%s\n" % (stats,))
        return err_qty
    finally:
        # Close only what was opened here; the standard streams stay open.
        if data_output is not None and data_output is not sys.stdout:
            data_output.close()
        if data_input is not None and data_input is not sys.stdin:
            data_input.close()


if __name__ == "__main__":
    # The exit status is the number of rows that raised, capped at 255.
    # Only the low 8 bits of an exit status survive on POSIX, so without
    # the cap a run with exactly 256 failures would look successful.
    sys.exit(min(process_source(parse_args()), 255))
