# -*- coding: utf-8 -*-

import urllib
from lxml import etree
import re

from sandbox import sdk2


class AsnNamesResource(sdk2.Resource):
    """
    Tab-separated file holding the names of Autonomous Systems, keyed by AS number
    (background: https://en.wikipedia.org/wiki/Autonomous_system_(Internet) ).
    """
    releasable = False
    executable = False
    any_arch = True


# Name of the TSV file the task writes and publishes as a resource.
RESULT_FILE = 'AsnNames.tsv'
# HTML page that the task downloads and parses.
CIDR_REPORT_URL = 'https://www.cidr-report.org/as2.0/autnums.html'
# Raw string: "\d" is not a valid string escape, so a plain literal only works
# by accident and triggers an invalid-escape warning on newer interpreters.
# Used with .match(), i.e. the text has to *start* with "AS" followed by digits.
AS_NUM_REGEXP = re.compile(r'AS\d+')


class AsnNamesCollectTask(sdk2.Task):
    """
    Download and parse HTML from https://www.cidr-report.org/as2.0/autnums.html

    Each <a> element found under /html/body/pre produces one line of
    RESULT_FILE: the link text (AS number), a TAB, and the text following the
    link (AS name). The file is then published as an AsnNamesResource.
    """

    def on_execute(self):
        """
        Fetch the report page, convert it to a TSV file and publish the resource.

        Raises:
            AssertionError: if the text of a link does not match AS_NUM_REGEXP.
        """
        html_name = 'autnums.html'
        urllib.urlretrieve(CIDR_REPORT_URL, html_name)

        # Read the raw bytes inside a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(html_name, 'rb') as html_file:
            raw = html_file.read()

        # The bytes are interpreted as latin-1 and handed to the parser as UTF-8.
        content = raw.decode('latin-1').encode('utf-8')

        html = etree.HTML(content)
        links = html.xpath("/html/body/pre/a")

        with open(RESULT_FILE, "w") as out_file:
            for link in links:
                # lxml reports missing text/tail as None; treat that as an empty
                # string so an empty link fails the pattern check below with a
                # readable message instead of an AttributeError on .strip().
                as_num = (link.text or '').strip()
                as_name = (link.tail or '').strip()

                # Explicit raise instead of `assert`: the check must survive
                # `python -O`. The exception type and message are unchanged.
                if not AS_NUM_REGEXP.match(as_num):
                    raise AssertionError(
                        'AS number should match {} pattern. Got: {}'.format(AS_NUM_REGEXP.pattern, as_num)
                    )

                out_file.write(as_num)
                out_file.write('\t')
                out_file.write(as_name.encode('utf-8'))
                out_file.write('\n')

        resource = sdk2.ResourceData(AsnNamesResource(self, "TSV file with names of ASNs", RESULT_FILE))
        resource.ready()
