#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import sys
from csv import reader
import ipaddr
import json
from functools import partial

# Names this script uses for the input CSV columns, listed by column position.
# head -1 GeoIP2-ISP-Blocks-IPv4.csv | sed -e 's:\([a-z_]*\):"\1":g' -e 's:,:\n,:g'
idx2name = ["network", "isp", "org", "as_num", "as_org"]


# Rows with fewer columns than this are counted as bad.
WANTED_COLUMNS_QTY = len(idx2name)
# Positions of the columns that are actually read.
NET_IDX, ISP_IDX, ORG_IDX, ASN_IDX = 0, 1, 2, 3

# lower-cased name -> {"id", "isp", "org"} record, and the reverse id -> name map.
orgnames2id = dict()
id2orgnames = dict()
# Geodata trait names compared in check mode.
checked_attr_name = ["org_name", "isp_name", "asn_list"]  # from libgeobase-py-binding
# Named counters; printed to stderr once processing is finished.
stats = dict()

# Accepted values of the --mode option.
mode_conv, mode_check = "convert", "check"


def parse_args():
    """Parse the command line (``sys.argv``) and return the argparse namespace.

    ``--mode`` is restricted to the two supported modes so that a mistyped
    value is rejected by argparse up front instead of leaving the row handler
    unset and failing on every data row later.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode',
                        default=mode_conv,
                        choices=[mode_conv, mode_check],
                        help="%s|%s" % (mode_conv, mode_check))
    parser.add_argument('--datafile',
                        default="-",
                        help="path to file with maxmind-isp-data (GeoIP2-ISP-Blocks-IPv[46].csv); '-' for stdin.")
    parser.add_argument('--geodata',
                        default="",
                        help="path to geodata for checking")
    parser.add_argument('--orgs-output',
                        default="",
                        help="path to output file for id-to-name of organizations matching.")
    parser.add_argument('--output',
                        default="",
                        help="path to output datafile; empty for stdout")
    parser.add_argument('--no-subst',
                        action='store_true',
                        help="dont substitute names via codes, @see mode=%s" % mode_conv)
    parser.add_argument('--disabled-attrs-list',
                        default="",
                        help="comma-separated list of attr (%s)" % (",".join(idx2name)))
    return parser.parse_args()


def inc_stat_counter(counter_name):
    """Add one to the ``stats`` counter called *counter_name*, creating it at 1."""
    if counter_name in stats:
        stats[counter_name] += 1
    else:
        stats[counter_name] = 1

def generate_id_by_name(name, org_kind):
    """Return the numeric id for *name*, registering the name on first sight.

    Names are compared case-insensitively (they are lower-cased before lookup).
    A new name gets the next sequential id (starting at 1), is recorded in both
    ``orgnames2id`` and ``id2orgnames``, and bumps the ``stats`` counter named
    after *org_kind*.  Every call increments the per-kind usage counter stored
    in the name's record.

    name     -- organization / ISP name as read from the input.
    org_kind -- key of the usage counter to increment; the record holds
                counters for ``idx2name[ISP_IDX]`` and ``idx2name[ORG_IDX]``.
    """
    name = name.lower()

    traits = orgnames2id.get(name)
    if traits is None:
        traits = {"id": 1 + len(orgnames2id), idx2name[ISP_IDX]: 0, idx2name[ORG_IDX]: 0}
        orgnames2id[name] = traits
        # Register the reverse mapping only here, under the id actually assigned;
        # doing it on every call would leave a phantom entry under an unused id.
        id2orgnames[traits["id"]] = name
        inc_stat_counter(org_kind)
    traits[org_kind] += 1

    return traits["id"]


def get_name_by_id(name_id):
    """Look up the name stored in ``id2orgnames`` under *name_id*.

    Raises KeyError when the id has not been registered.
    """
    registered_name = id2orgnames[name_id]
    return registered_name


def setup_name(traits, org_kind, column_value):
    """Store the id of *column_value* in ``traits[org_kind]``.

    An empty *column_value* leaves *traits* untouched.
    """
    if column_value:
        traits[org_kind] = generate_id_by_name(column_value, org_kind)


def process_row(columns, disabled_attrs, proc_fn):
    """Validate one parsed CSV row and hand its range and traits to *proc_fn*.

    columns        -- list of cell strings for the row.
    disabled_attrs -- attribute names (as in ``idx2name``) to leave out of the traits.
    proc_fn        -- callable invoked as ``proc_fn(first_ip, last_ip, traits)``.

    Returns True when the row was processed or is ignorable (empty row, comment,
    or a first cell that does not start with a digit); returns False when the
    row has too few columns or carries neither an ISP nor an organization name.
    """
    first_cell = columns[NET_IDX] if columns else None
    if first_cell is None or first_cell.startswith('#') or not first_cell[0].isdigit():
        inc_stat_counter("useless_lines")
        return True

    if len(columns) < WANTED_COLUMNS_QTY:
        inc_stat_counter("bad_column_lines")
        return False

    isp_name, org_name, as_num = columns[ISP_IDX], columns[ORG_IDX], columns[ASN_IDX]

    if not (isp_name or org_name):
        inc_stat_counter("empty_names")
        return False

    # Purely informational: count rows whose two (non-empty) names coincide.
    if isp_name and isp_name == org_name:
        inc_stat_counter("isp_eq_org")

    net = ipaddr.IPNetwork(first_cell)
    traits = {}

    for column_idx in (ISP_IDX, ORG_IDX):
        kind = idx2name[column_idx]
        if kind not in disabled_attrs:
            setup_name(traits, kind, columns[column_idx])

    asn_key = idx2name[ASN_IDX]
    if as_num and asn_key not in disabled_attrs:
        traits[asn_key] = as_num
        inc_stat_counter(asn_key)

    # First and last address of the network delimit the range.
    proc_fn(net[0], net[-1], traits)
    return True


def mode_conv_fn(no_subst, data_output, ip_begin, ip_end, traits):
    """Write one converted range line: ``<begin>-<end><TAB><compact sorted JSON traits>``.

    With *no_subst* set, the ISP and organization ids held in *traits* are
    replaced (in place) by their registered names before serialization.
    """
    if no_subst:
        for kind in (idx2name[ISP_IDX], idx2name[ORG_IDX]):
            name_id = traits.get(kind)
            if name_id:
                traits[kind] = get_name_by_id(name_id)

    line = "%s-%s\t%s" % (ip_begin, ip_end, json.dumps(traits, separators=(',', ':'), sort_keys=True))
    print >>data_output, line


def mode_check_fn(geo6lookup, data_output, ip_begin, ip_end, traits):
    """Cross-check the traits of one input range against geodata lookups.

    The geodata is queried for the first and the last address of the range.
    When any problem is found, one line is written to *data_output*:
    ``<begin>-<end><TAB><JSON traits><TAB>//<TAB><problems joined by TAB>``.

    Reported problems (each also bumps a ``stats`` counter):
      /GEO-DIFF -- a checked geodata attribute differs between both range ends.
      ISP / ORG -- the name registered for the row's id differs from the
                   geodata value at the start of the range.
      MM-ONLY   -- the row has an AS number but neither lookup has an asn_list.
      MM!=GEO   -- the row's AS number is not listed at either end of the range.

    geo6lookup -- object exposing ``get_ip_traits(ip_string)``; the result is
                  used like a dict (``.get`` and item access).
    traits     -- traits of the row; ``isp`` / ``org`` hold ids from ``id2orgnames``.
    """
    ip_traits_begin = geo6lookup.get_ip_traits(str(ip_begin))
    inc_stat_counter("geo_traits_begin")

    ip_traits_end = geo6lookup.get_ip_traits(str(ip_end))
    inc_stat_counter("geo_traits_end")

    diff_geo_values = ["%s('%s'!='%s')" % (x, ip_traits_begin.get(x, ""), ip_traits_end.get(x, ""))
                       for x in checked_attr_name
                       if ip_traits_begin.get(x, "") != ip_traits_end.get(x, "")]
    err_parts = []

    if diff_geo_values:
        inc_stat_counter("warn-geo-diff")
        err_parts.append("/GEO-DIFF (%s)" % ",".join(diff_geo_values))

    if traits.get("isp", None) and id2orgnames[traits["isp"]] != ip_traits_begin['isp_name']:
        inc_stat_counter("err-isp")
        err_parts.append("ISP (%s != %s)" % (id2orgnames[traits["isp"]], ip_traits_begin['isp_name']))

    if traits.get("org", None) and id2orgnames[traits["org"]] != ip_traits_begin['org_name']:
        inc_stat_counter("err-org")
        # Report the organization name itself; looking up the ISP id here would
        # print the wrong name, or fail outright for rows without an ISP.
        err_parts.append("ORG (%s != %s)" % (id2orgnames[traits["org"]], ip_traits_begin['org_name']))

    if traits.get("as_num", None):
        if not ip_traits_begin.get("asn_list", None) and not ip_traits_end.get("asn_list", None):
            inc_stat_counter("err-mm-only")
            err_parts.append("MM-ONLY")
        else:
            asn_begin_str = ip_traits_begin.get("asn_list", "")
            asn_end_str = ip_traits_end.get("asn_list", "")

            # asn_list is a comma-separated string; pool the entries of both ends.
            all_geo_uniq_asn_for_range = set(asn_begin_str.split(',') + asn_end_str.split(','))

            if traits["as_num"] not in all_geo_uniq_asn_for_range:
                inc_stat_counter("err-mm-geo")
                err_parts.append("MM!=GEO (%s/%s)" % (asn_begin_str, asn_end_str))

    if err_parts:
        traits_str = json.dumps(traits, separators=(',', ':'), sort_keys=True)
        print >>data_output, "%s-%s\t%s\t//\t%s" % (ip_begin, ip_end, traits_str, "\t".join(err_parts))


def process_source(args):
    """Run the selected mode over the input CSV and return the number of failed rows.

    Rows are read from ``args.datafile`` ('-' means stdin) and passed to
    ``process_row`` together with the handler for ``args.mode``; the handler
    writes to ``args.output`` (stdout when empty).  In convert mode, when
    ``args.orgs_output`` is set, the collected name table is written there as
    ``id<TAB>name<TAB>isp-uses<TAB>org-uses``.  The ``stats`` counters are
    printed to stderr at the end.

    Returns the number of rows whose processing raised an exception.
    """
    data_input = sys.stdin if '-' == args.datafile else open(args.datafile, 'r')
    data_output = None
    err_qty = 0
    try:
        data_output = sys.stdout if not args.output else open(args.output, 'w')
        disabled_attrs = args.disabled_attrs_list.split(',')

        # An unrecognized mode leaves proc_fn unset; rows reaching the handler
        # then fail and are counted as exceptions below.
        proc_fn = None
        if args.mode == mode_conv:
            proc_fn = partial(mode_conv_fn, args.no_subst, data_output)
        elif args.mode == mode_check:
            # Imported lazily so that convert mode works without the geobase binding.
            import geobase6
            lookup = geobase6.Lookup(args.geodata)
            proc_fn = partial(mode_check_fn, lookup, data_output)

        for columns in reader(data_input):
            try:
                inc_stat_counter("input_lines")
                if process_row(columns, disabled_attrs, proc_fn):
                    continue
                inc_stat_counter("bad_input_lines")
            except Exception as ex:
                # Best effort: report the row, keep going, and count the failure.
                print >>sys.stderr, "err[%s] in [%s]" % (ex, columns)
                inc_stat_counter("exceptions")
                err_qty += 1
    finally:
        # Close only the files opened here; never the process-wide std streams.
        if data_output is not None and data_output is not sys.stdout:
            data_output.close()
        if data_input is not sys.stdin:
            data_input.close()

    if args.mode == mode_conv and args.orgs_output:
        with open(args.orgs_output, 'w') as out_org:
            for org_name, org_traits in orgnames2id.iteritems():
                print >>out_org, "%d\t%s\t%d\t%d" % (org_traits["id"], org_name, org_traits["isp"], org_traits["org"])

    print >>sys.stderr, stats
    return err_qty


if __name__ == "__main__":
    # The error count doubles as the exit status.  Statuses above 255 are
    # typically truncated by the OS, so e.g. 256 errors would read as success;
    # clamp the value so that any failure yields a non-zero status.
    sys.exit(min(process_source(parse_args()), 255))
